Applying logistic regression for classifying human-machine dialogue and human-human dialogue (Part II)
Jessica / 2019-05-16
Two weeks ago, I used 24 continuous variables (such as Different Words, Sentence Length Average, Function Word proportion, etc.) to classify human-human and human-machine dialogues. This week I try classifying with categorical variables instead. First, these categorical variables have to be converted into dummy variables.
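As a rough sketch of the dummy-coding step, `pandas.get_dummies` can do this conversion automatically, assuming the raw annotation were a single categorical column (the `DA_type` column here is hypothetical; my actual annotation, shown below, was done by hand as 0/1 columns):

```python
import pandas as pd

# Hypothetical raw annotation: one dialogue-act label per utterance.
raw = pd.DataFrame({'DA_type': ['starter', 'Inf', 'Ans', 'starter']})

# get_dummies expands the categorical column into one 0/1 indicator column
# per category; drop_first=True would drop one level as the reference.
dummies = pd.get_dummies(raw['DA_type'])
print(dummies)
```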
from sklearn import preprocessing, linear_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
plt.style.use('ggplot')
plt.rcParams['font.family']='SimHei' # SimHei, a Chinese font, for Chinese labels in plots
data = pd.read_excel('sample_50_anno.xlsx',sheet_name = None)
DAdata=data.get('DA_tp') # get the 'DA_tp' sheet as a DataFrame
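With `sheet_name=None`, `pd.read_excel` loads every sheet into a dict keyed by sheet name, so the available sheets can be checked before picking one:

```python
# list the sheet names in the workbook; 'DA_tp' is the one used below
print(list(data.keys()))
```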
This time the categorical variable is the dialogue act type, with 11 types. The data are the same as last week: 100 dialogues, 50 human-human and 50 human-machine. I annotated each utterance by hand: the column for its dialogue act type is marked 1, and the others 0. (Only 10 of the indicator columns are used as predictors below, consistent with one type serving as the reference level in the dummy coding.)
Below is the data after dummy coding:
df=DAdata[['starter','Inf','Ans','offer','Sug','Req','Chk.Q','Pro.Q','Set.Q','Autopositive','Machine']]
df.head()
| | starter | Inf | Ans | offer | Sug | Req | Chk.Q | Pro.Q | Set.Q | Autopositive | Machine |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
x=df[['starter','Inf','Ans','offer','Sug','Req','Chk.Q','Pro.Q','Set.Q','Autopositive']]
y=df['Machine'] # target label as a 1-D Series (avoids scikit-learn's column-vector warning)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2021)
x_train
| | starter | Inf | Ans | offer | Sug | Req | Chk.Q | Pro.Q | Set.Q | Autopositive |
|---|---|---|---|---|---|---|---|---|---|---|
173 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
112 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
179 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
134 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
180 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
66 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
34 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
169 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
77 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
45 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
163 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
73 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
65 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
68 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
170 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
75 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
130 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
104 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
81 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
86 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
139 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
177 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
202 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
145 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
98 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
123 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
54 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
178 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
144 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
49 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
154 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
124 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
101 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
70 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
102 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
140 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
152 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
93 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
157 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
62 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
44 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
94 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
109 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
128 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
85 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
144 rows × 10 columns
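A quick sanity check on the split (reusing the objects above; the 144-row, 10-column shape matches the table, and the remaining rows form the test set):

```python
# training and test shapes after the 70/30 split
print(x_train.shape, x_test.shape)

# class balance of the 'Machine' label in each split
print(y_train.value_counts())
print(y_test.value_counts())
```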
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
sc.fit(x_train)                    # learn per-column mean and std from the training set only
x_train_nor=sc.transform(x_train)  # standardize the training set...
x_test_nor=sc.transform(x_test)    # ...and the test set with the same statistics
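`StandardScaler` stores the per-column statistics it learned from the training set, and they can be inspected after fitting; for a 0/1 dummy column, the mean is simply the proportion of 1s:

```python
# column means and scales estimated from x_train
print(sc.mean_)
print(sc.scale_)
```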
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(solver='liblinear') # pin the solver; the scikit-learn default changed in 0.22
lr.fit(x_train_nor,y_train)
# print the coefficients
print(lr.coef_)
# print the p-values of the 10 predictors; significance is judged by whether p < 0.05 (95% confidence level)
print(f_regression(x_train_nor,y_train)[1])
# print the intercept
print(lr.intercept_ )
[[-0.08006139 -0.44116689 0.08343295 0. -0.45971731 -0.83366503
-0.22135901 -0.09189166 0.35124534 -0.08632877]]
[0.50361434 0.01848812 0.24974631 nan 0.13953905 0.00446218
0.27466972 0.8810979 0.00176625 0.95305294]
[-0.00500389]
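Since the inputs were standardized, each coefficient can be read, after exponentiating, as an odds ratio per one standard deviation of that indicator; a minimal sketch reusing the fitted `lr`:

```python
import numpy as np

feature_names = ['starter', 'Inf', 'Ans', 'offer', 'Sug', 'Req',
                 'Chk.Q', 'Pro.Q', 'Set.Q', 'Autopositive']

# exp(coef) > 1 pushes the prediction toward class 1 ('Machine'),
# exp(coef) < 1 pushes it toward class 0
for name, orat in zip(feature_names, np.exp(lr.coef_[0])):
    print(f'{name}: {orat:.3f}')
```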
(On this run, f_regression also raised a few "RuntimeWarning: invalid value encountered" messages. They appear to come from the offer column, which happens to be constant in this training split: its correlation with the target is undefined, which is why its p-value above is nan and its coefficient is exactly 0.)

Each row of the array below gives the predicted probabilities for one test utterance, in the order [P(class 0), P(class 1)], with class 1 being the 'Machine' label.
np.round(lr.predict_proba(x_test_nor),3)
array([[0.461, 0.539],
[0.498, 0.502],
[0.684, 0.316],
[0.236, 0.764],
[0.367, 0.633],
[0.684, 0.316],
[0.461, 0.539],
[0.367, 0.633],
[0.367, 0.633],
[0.684, 0.316],
[0.236, 0.764],
[0.732, 0.268],
[0.367, 0.633],
[0.684, 0.316],
[0.236, 0.764],
[0.684, 0.316],
[0.461, 0.539],
[0.684, 0.316],
[0.461, 0.539],
[0.367, 0.633],
[0.684, 0.316],
[0.367, 0.633],
[0.461, 0.539],
[0.236, 0.764],
[0.236, 0.764],
[0.367, 0.633],
[0.498, 0.502],
[0.972, 0.028],
[0.991, 0.009],
[0.981, 0.019],
[0.973, 0.027],
[0.598, 0.402],
[0.461, 0.539],
[0.236, 0.764],
[0.367, 0.633],
[0.684, 0.316],
[0.236, 0.764],
[0.367, 0.633],
[0.367, 0.633],
[0.236, 0.764],
[0.461, 0.539],
[0.684, 0.316],
[0.461, 0.539],
[0.461, 0.539],
[0.684, 0.316],
[0.367, 0.633],
[0.236, 0.764],
[0.972, 0.028],
[0.461, 0.539],
[0.684, 0.316],
[0.498, 0.502],
[0.416, 0.584],
[0.973, 0.027],
[0.498, 0.502],
[0.461, 0.539],
[0.236, 0.764],
[0.684, 0.316],
[0.639, 0.361],
[0.723, 0.277],
[0.684, 0.316],
[0.236, 0.764],
[0.236, 0.764]])
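`lr.predict` simply thresholds these probabilities at 0.5, and the equivalence can be checked by hand (reusing `lr` and `x_test_nor`):

```python
import numpy as np

proba = lr.predict_proba(x_test_nor)[:, 1]  # P(class 1) for each test utterance
labels = (proba > 0.5).astype(int)          # hard labels via a 0.5 threshold

# should print True: same labels as lr.predict
print(np.array_equal(labels, lr.predict(x_test_nor)))
```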
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
from sklearn.metrics import confusion_matrix
cnf=confusion_matrix(y_test, lr.predict(x_test_nor))
print('Confusion matrix:', cnf)
Confusion matrix: [[12 20]
 [11 19]]
import itertools
target_name=['human','machine'] # class 0 = human-human, class 1 = human-machine ('Machine' = 1)
plot_confusion_matrix(cnf,classes=target_name,title='confusion matrix')
plt.show()
Confusion matrix, without normalization
[[12 20]
[11 19]]
accuracy=(12+19)/(12+19+11+20) # (TN + TP) / total
print(accuracy)
0.5
precision=19/39 # TP / (TP + FP) = 19 / (19 + 20)
print(precision)
0.48717948717948717
# recall, also called sensitivity
recall=19/30 # TP / (TP + FN) = 19 / (19 + 11)
print(recall)
0.6333333333333333
F1=0.61709401709/1.12051282051 # = 2 * precision * recall / (precision + recall)
print(F1)
0.5507246376789607
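Instead of typing the counts in by hand, the same metrics can be computed directly from the predictions with `sklearn.metrics` (reusing `y_test`, `lr`, and `x_test_nor`):

```python
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = lr.predict(x_test_nor)
print('accuracy :', accuracy_score(y_test, y_pred))
print('precision:', precision_score(y_test, y_pred))
print('recall   :', recall_score(y_test, y_pred))
print('F1       :', f1_score(y_test, y_pred))
```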
It looks like the results are not as good as last time…