import numpy as np
np.set_printoptions(precision=4)
类别的编码:将类别变为从 0 开始计数;
from sklearn.preprocessing import LabelEncoder
>>> y = np.array(['r', 'g', 'g', 'b'])
>>> enc = LabelEncoder()
>>> enc.fit(y)
>>> y = enc.transform(y)
>>> y
array([2, 1, 1, 0], dtype=int32)
>>> enc.transform(['r', 'g', 'b'])
array([2, 1, 0], dtype=int32)
或者用 fit_transform 将 fit、transform 两步并作一步:
>>> y = np.array(['r', 'g', 'g', 'b'])
>>> enc = LabelEncoder()
>>> y = enc.fit_transform(y)
>>> y
array([2, 1, 1, 0], dtype=int32)
自然也有求逆的转换(inverse_transform)(我们为了操作的方便,将原本内含丰富的类别标签转换为以 0 开始的数字型标签,操作完成之后,我们希望能转换为原始的标签形态):
>>> enc.inverse_transform(y)
array(['r', 'g', 'g', 'b'], dtype='<U1')
clone:同时保存原始和fitted后的模型
from sklearn.base import clone
self.classifiers_ = []
for clf in self.classifiers:
fitted_clf = clone(clf).fit(X, self.enc.transform(y))
self.classifiers_.append(fitted_clf)
clf.predict_proba()
from sklearn.naive_bayes import BernoulliNB
>>> classes = ['one','two','three','one','three']
>>> feature = [[0,1,1,0],[0,1,0,1],[1,1,0,0],[0,0,0,0],[0,1,1,1]]
>>> clf = BernoulliNB()
>>> clf.fit(feature, classes)
>>> P = clf.predict_proba(feature)
>>> P
[[ 0.5428 0.3619 0.0953]
[ 0.1958 0.3916 0.4126]
[ 0.2835 0.5671 0.1494]
[ 0.7154 0.159 0.1256]
[ 0.2467 0.4934 0.2599]]
>>> P.shape
(5, 3)
# n_samples * n_classes
>>> P.dot(np.ones(len(np.unique(classes))))
[ 1. 1. 1. 1. 1.]
# P的行和为1
# 也即P的每一行代表当前样本属于每一类的概率分布
scikit-learn uses the
predict_proba
method (if applicable) to compute the
ROC AUC score
. 关于ROC AUC 理解及实践的详细信息,见
[ROC曲线与AUC区域的理解与实践]
,
[分类模型的精确率(precision)与召回率(recall)(Python)]
fit:模型(model)从训练数据集中学习得到(模型所需)参数的过程即为 fit。
class MajVoteClf(BaseEstimator, ClassifierMixin):
    """Majority-vote ensemble classifier skeleton.

    Stores a list of scikit-learn classifiers; ``fit`` trains a clone of
    each one on labels re-encoded to 0..n_classes-1 with LabelEncoder,
    keeping the unfitted originals intact in ``self.classifiers``.
    """

    def __init__(self, classifiers):
        # sklearn convention: __init__ only stores constructor parameters;
        # all learning happens in fit().
        self.classifiers = classifiers

    def fit(self, X, y):
        """Fit a clone of every base classifier on (X, encoded y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Raw class labels (any hashable values).

        Returns
        -------
        self : MajVoteClf
            The fitted estimator, per sklearn convention.
        """
        self.enc = LabelEncoder()
        self.enc.fit(y)
        # BUG FIX: LabelEncoder exposes the learned labels as `classes_`
        # (trailing underscore); `self.enc.classes` raises AttributeError.
        self.classes = self.enc.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            # clone() gives a fresh unfitted copy, so the originals in
            # self.classifiers are never mutated; the fitted copies are
            # collected in self.classifiers_.
            fitted_clf = clone(clf).fit(X, self.enc.transform(y))
            self.classifiers_.append(fitted_clf)
        # Return self so fit() can be chained (e.g. clf.fit(X, y).predict(X)).
        return self
_name_estimators
from sklearn.pipeline import _name_estimators
>>> from sklearn.pipeline import _name_estimators
>>> help(_name_estimators)
Generate names for estimators.(及参数信息)
以二元tuple构成的list的形式返回,每一个list中的元素(二元tuple)对应于一个classifier。
>>> from sklearn.svm import SVC
>>> from sklearn.linear_model import LogisticRegression
>>> _name_estimators([SVC(), LogisticRegression()])
[
(
'logisticregression',
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0)
),
(
'svc',
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
)
]
接下来,我们将这些二元tuple构成的list转换为dict类型(非常实用的一种转换):
>>> named_clfs = {k: v for k, v in _name_estimators([LogisticRegression(), SVC()])}
{
'logisticregression':
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0),
'svc':
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
}
版权声明:本文为lanchunhui原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。