Incomplete association rules
Computes the similarity between two sentences, but automatic extraction of the word frequencies was not implemented (a possible fix is sketched right after the vocabulary code below).
import pandas as pd
from collections import Counter
doc1=['hotel','quiet','hotel','cheap','hotel',
'hotel','nice','hotel']
doc2=['quiet','hotel','nice']
doc3=['noise','hotel','cheap','hotel']
#Extract all the words
doc = doc1 + doc2 + doc3
volc = []  #vocabulary
for i in Counter(doc).keys():
    volc.append(i)
df = pd.DataFrame(columns=volc)
#How to automatically map each df column name to its word and get the frequency?
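One way to fill in the missing step the comment above asks about: count each document with Counter and read the counts out in volc's column order. A minimal sketch using only the names defined above; if used, it would replace the hand-typed vectors below.
for i, d in enumerate([doc1, doc2, doc3]):
    counts = Counter(d)  #Counter returns 0 for words absent from the document
    df.loc[i] = [counts[word] for word in volc]
print(df)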
import numpy as np
query=[1,1,1,0,1]
vec1=[1,1,5,0,1]
vec2=[1,0,1,0,1]
vec3=[0,1,2,1,0]
#append the vectors as rows of the dataframe
df.loc[0]=vec1
df.loc[1]=vec2
df.loc[2]=vec3
df.loc[3]=query
#my_cos below only works on numeric data
def my_cos(se1, se2):
    fenzi = 0  #numerator: the dot product (equivalent to np.dot(se1, se2))
    for i in range(len(se1)):
        f = se1[i]*se2[i]
        fenzi = fenzi + f
    y0 = np.linalg.norm(se1)  #vector norms for the denominator
    y1 = np.linalg.norm(se2)
    coss = fenzi/(y0*y1)
    print(coss)
my_cos(vec2, vec1)
#Cosine similarity from scipy: cosine() returns the distance, so 1 - distance is the similarity
from scipy.spatial.distance import cosine
s = cosine(vec2, vec1)  #the numeric vectors must be used here, not the word lists doc1/doc2
print(1 - s)  #should match the my_cos(vec2, vec1) output above
#tf: the number of times term t occurs in document d (sometimes the raw count itself is used as tf)
#df (also written n_t): the number of documents that contain the term
#idf = log2(N/df), where N is the total number of documents
#weight w = tf * idf
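#Worked example with the rows added above: the first column holds (1, 1, 0, 1),
#so df = 3 and N = 4, giving idf = log2(4/3) ≈ 0.415; a row with tf = 1 there gets w ≈ 0.415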
#Compute df: the number of documents that contain each word
df[df.columns[0]].loc[0]  #(exploratory: reads a single cell)
numl = []
for j in range(len(vec1)):  #number of columns
    num = 0
    for q in range(len(df)):  #number of rows
        if df[df.columns[j]].loc[q] != 0:
            num = num + 1
    numl.append(num)
#Compute the idf value of each word
idfl = [np.log2(len(df)/c) for c in numl]
#Compute w for each row, i.e. its tf-idf vector
for t in range(len(df)):  #range(4) is frozen here, so the rows appended below are not revisited
    w = np.multiply(df.loc[t].tolist(), idfl)
    print(w)
    df.loc[len(df)] = w  #append the tf-idf vector as a new row (rows 4-7)
query=[1,1,1,0,1]
my_cos(query, df.loc[0].tolist())  #.tolist() so the row is indexed positionally inside my_cos
my_cos(query, df.loc[1].tolist())
my_cos(query, df.loc[2].tolist())
my_cos(query, df.loc[3].tolist())
my_cos(query, df.loc[4].tolist())  #rows 4 and up hold the tf-idf vectors appended above
my_cos(query, df.loc[5].tolist())
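For comparison, scikit-learn automates the counting and weighting done by hand above. A minimal sketch, assuming scikit-learn is installed; note that TfidfVectorizer uses a smoothed natural-log idf, so its numbers differ from the log2 version here.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
texts = [' '.join(doc1), ' '.join(doc2), ' '.join(doc3)]  #join the token lists into strings
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(texts)  #3 x vocabulary sparse tf-idf matrix
print(vectorizer.get_feature_names_out())  #the learned vocabulary
print(cosine_similarity(tfidf[0], tfidf[1]))  #tf-idf similarity of doc1 and doc2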
Decision tree
Compute the information gain and pick the variable with the greatest influence on y (a best-split loop is sketched at the end of this section)
#Decision tree: information gain and entropy
import pandas as pd
import numpy as np
#data = pd.DataFrame(columns=('x1','x2','x3','y'))  #unused: immediately reassigned below
data_loan = [['yes','single','125k','no'],  #first dataset, renamed so the example below does not shadow it
             ['no','married','100k','no'],
             ['no','single','70k','no'],
             ['yes','married','120k','no'],
             ['no','divorced','95k','yes'],
             ['no','married','60k','no'],
             ['yes','divorced','220k','no'],
             ['no','single','85k','yes'],
             ['no','married','75k','no'],
             ['no','single','90k','yes']]
data=[['a1','b2','c2','1'],
['a1','b1','c2','2'],
['a2','b1','c1','1'],
['a2','b2','c3','1'],
['a2','b2','c2','1'],
['a2','b2','c1','2']]
def calcShannonEnt(data):
    numEntires = len(data)  #number of rows in the dataset
    labelCounts = {}  #dictionary of how many times each label occurs
    for featVec in data:  #tally every feature vector
        currentLabel = featVec[-1]  #extract the label
        if currentLabel not in labelCounts.keys():  #add any label not yet in the dictionary
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  #count the label
    shannonEnt = 0.0
    for key in labelCounts:  #compute the Shannon entropy
        prob = float(labelCounts[key]) / numEntires  #probability of this label
        shannonEnt += -prob*np.log2(prob)  #apply the entropy formula
    #print('Entropy of the target y:', shannonEnt)
    return shannonEnt  #the empirical (Shannon) entropy
calcShannonEnt(data)
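#Hand check on the active dataset: the y column is ['1','2','1','1','1','2'],
#so p('1') = 4/6 and p('2') = 2/6, and the entropy is -(4/6)*log2(4/6) - (2/6)*log2(2/6) ≈ 0.918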
def IGain(data, n):  #despite its name, this returns the conditional entropy of y given column n
    xCounts = {}  #the categories of column n and their counts
    for featVec in data:
        xlabel = featVec[n]  #value of column n in this row
        if xlabel not in xCounts.keys():
            xCounts[xlabel] = 0
        xCounts[xlabel] += 1
    wei_ent = 0.0  #the weighted (conditional) entropy accumulated over the categories
    for label in list(xCounts.keys()):
        xlabelCounts = {}  #y categories and their counts within this category of x
        for i in range(len(data)):
            if data[i][n] == label:  #restrict to rows where column n has this value
                xcurrentLabel = data[i][-1]
                if xcurrentLabel not in xlabelCounts.keys():
                    xlabelCounts[xcurrentLabel] = 0
                xlabelCounts[xcurrentLabel] += 1
        #print(sum(xlabelCounts.values()))
        xnum = sum(xlabelCounts.values())  #number of rows in this category of x
        xEnt = 0.0
        for key in xlabelCounts:
            prob = float(xlabelCounts[key]) / xnum
            #print(prob)
            xEnt += -prob*np.log2(prob)
        print('Entropy of y where x =', label, ':', xEnt)
        wei_ent += xnum/len(data)*xEnt
    #print(wei_ent)
    return wei_ent
IGain(data,1)
print('Information gain of this column:', calcShannonEnt(data)-IGain(data,1))
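As promised above, the same two functions can be looped over every feature column to find the best split; a minimal sketch using only what is already defined.
gains = {}
for n in range(len(data[0]) - 1):  #every column except the label
    gains[n] = calcShannonEnt(data) - IGain(data, n)
print(gains)
print('Best split column:', max(gains, key=gains.get))  #the column with the highest gain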
Naive Bayes
Given the feature values, compute the probability of the outcome
#Naive Bayes
import pandas as pd
import numpy as np
#data=pd.DataFrame(columns=('x1','x2','x3','y'))
data=[['yes','single','high','no'],
['no','married','middle','no'],
['no','single','low','no'],
['yes','married','middle','no'],
['no','divorced','middle','yes'],
['no','married','low','no'],
['yes','divorced','high','no'],
['no','single','low','yes'],
['no','married','low','no'],
['no','single','low','yes']]
#Compute a single conditional probability p(x|y)
def naivebayes(data, n, x, y):  #n: column index, x: the value of column n, y: the label value
    numEntires = len(data)  #number of rows in the dataset
    labelCounts = {}  #counts of each y label
    for featVec in data:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    num = 0
    for i in range(len(data)):
        if data[i][-1] == y and data[i][n] == x:  #rows where both the label and the feature match
            num = num + 1
    #print(num/labelCounts[y])
    print('p(', y, ') =', labelCounts[y]/numEntires)  #the prior p(y)
    return num/labelCounts[y]  #the conditional p(x|y)
p1=naivebayes(data,0,'no','no')
print(p1)
p2=naivebayes(data,1,'divorced','no')
print(p2)
p3=naivebayes(data,2,'low','no')
print(p3)
#p(y|x) is proportional to the product of the individual p(x_i|y) times the prior p(y)
py = sum(1 for row in data if row[-1] == 'no') / len(data)  #p(y='no') = 7/10
print(p1*p2*p3*py)
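To turn this into a classifier, the same product can be computed for every label and the largest one chosen; the shared evidence p(x) can be ignored since it is identical for all labels. A minimal sketch; classify and its (column, value) argument format are my own naming, not from the original post.
def classify(data, features):  #hypothetical helper; features: list of (column, value) pairs
    labels = set(row[-1] for row in data)
    scores = {}
    for y in labels:
        py = sum(1 for row in data if row[-1] == y) / len(data)  #prior p(y)
        score = py
        for n, x in features:
            score *= naivebayes(data, n, x, y)  #multiply in each p(x|y)
        scores[y] = score
    return max(scores, key=scores.get), scores
print(classify(data, [(0, 'no'), (1, 'divorced'), (2, 'low')]))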