Python Implementations of Association Rules, Decision Trees, and Naive Bayes

Incomplete Association Rules

This computes the similarity between two documents, but the automatic term-frequency extraction step is not implemented.

import pandas as pd
from collections import Counter
doc1=['hotel','quiet','hotel','cheap','hotel',
      'hotel','nice','hotel']
doc2=['quiet','hotel','nice']
doc3=['noise','hotel','cheap','hotel']
#collect every distinct word
doc=doc1+doc2+doc3
volc=list(Counter(doc).keys())#vocabulary
df=pd.DataFrame(columns=volc)


#TODO: automatically map each word to the matching df column and count its
#frequency (a sketch of this step follows this code block)
import numpy as np
query=[1,1,1,0,1]
vec1=[1,1,5,0,1]
vec2=[1,0,1,0,1]
vec3=[0,1,2,1,0]
#append the rows to the dataframe
df.loc[0]=vec1
df.loc[1]=vec2
df.loc[2]=vec3
df.loc[3]=query
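
The missing step flagged above can be sketched with a small helper that counts each vocabulary word in a document (build_tf_row is a hypothetical name, reusing the Counter import from earlier). Deriving rows this way keeps them aligned with df's column order, which the hardcoded vec1-vec3 appear not to follow:

def build_tf_row(document, vocabulary):
    #term frequency of every vocabulary word in one document
    counts = Counter(document)
    return [counts.get(word, 0) for word in vocabulary]
print(build_tf_row(doc1, volc))#[5, 1, 1, 1, 0] for volc order hotel/quiet/cheap/nice/noise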

#cosine similarity; only numeric inputs are supported
def my_cos(se1,se2):
  fenzi=0#the dot product ("fenzi" = numerator)
  for i in range(len(se1)):
      fenzi=fenzi+se1[i]*se2[i]
  y0=np.linalg.norm(se1)
  y1=np.linalg.norm(se2)
  coss=fenzi/(y0*y1)
  print (coss)
  return coss
my_cos(vec2,vec1)
  
#scipy's cosine() returns the cosine *distance* between numeric vectors
#(doc1/doc2 are word lists, not vectors), so similarity = 1 - distance
from scipy.spatial.distance import cosine
s=cosine(vec2,vec1)
print (1-s)


#tf: number of times term t occurs in document d
#df (also written n_t): number of documents that contain the term
#idf = log2(N/df), where N is the total number of documents
#weight w = tf*idf
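#worked example (illustrative numbers only): a term occurring 5 times in a
#document and present in 3 of N=4 documents gets w = 5*log2(4/3) ≈ 2.07,
#while a term present in every document gets idf = log2(4/4) = 0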

#compute df: the number of rows in which each word appears
numl=[]
for j in range(len(vec1)):#columns
    num=0
    for q in range(len(df)):#rows
          if df[df.columns[j]].loc[q] != 0:
              num=num+1
    numl.append(num)
#compute the idf of every word
idfl=[np.log2(len(df)/c) for c in numl]
#compute the weights w (the tf-idf vector) of every row; range(len(df)) is
#fixed at 4 before the loop starts, so the appends below do not extend it
for t in range(len(df)):
    w=np.multiply(df.loc[t].tolist(), idfl)
    print(w)
    df.loc[len(df)]=w#rows 4-7 now hold the tf-idf vectors
    
#compare the query's tf-idf vector (row 7, since the query was row 3)
#with each document's tf-idf vector (rows 4-6)
my_cos(df.loc[7].tolist(),df.loc[4].tolist())
my_cos(df.loc[7].tolist(),df.loc[5].tolist())
my_cos(df.loc[7].tolist(),df.loc[6].tolist())



Decision Tree

Compute the information gain of each variable and pick the one with the greatest influence on y.

#decision tree: information gain and entropy
import pandas as pd
import numpy as np
#each row is [x1, x2, x3, y]; this first dataset is immediately replaced
#by the smaller one below, which the rest of this section uses
data=[['yes','single','125k','no'],
      ['no','married','100k','no'],
      ['no','single','70k','no'],
      ['yes','married','120k','no'],
      ['no','divorced','95k','yes'],
      ['no','married','60k','no'],
      ['yes','divorced','220k','no'],
      ['no','single','85k','yes'],
      ['no','married','75k','no'],
      ['no','single','90k','yes']]


#the dataset actually used below
data=[['a1','b2','c2','1'],
      ['a1','b1','c2','2'],
      ['a2','b1','c1','1'],
      ['a2','b2','c3','1'],
      ['a2','b2','c2','1'],
      ['a2','b2','c1','2']]


def calcShannonEnt(data):
    numEntires = len(data)                       #number of rows in the dataset
    labelCounts = {}                             #occurrence count of each label
    for featVec in data:                         #tally the label of every row
        currentLabel = featVec[-1]               #the label is the last column
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:                      #Shannon entropy of y
        prob = float(labelCounts[key]) / numEntires  #probability of the label
        shannonEnt += -prob*np.log2(prob)
    return shannonEnt                            #empirical (Shannon) entropy
calcShannonEnt(data)
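#check: the six-row dataset has four rows labeled '1' and two labeled '2',
#so calcShannonEnt(data) = -(4/6)*log2(4/6) - (2/6)*log2(2/6) ≈ 0.918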

def IGain(data,n):#returns the weighted conditional entropy H(y|x_n)
  xCounts = {}  #categories of feature n and their counts
  for featVec in data:
        xlabel = featVec[n]#value of the feature in column n
        if xlabel not in xCounts.keys():
            xCounts[xlabel] = 0
        xCounts[xlabel] += 1
  wei_ent=0.0#weighted entropy of y over the feature's categories
  for label in list(xCounts.keys()):
     xlabelCounts = {}#counts of each y label within this feature value
     for i in range(len(data)):
       if data[i][n] == label:
          xcurrentLabel=data[i][-1]
          if xcurrentLabel not in xlabelCounts.keys():
            xlabelCounts[xcurrentLabel] = 0
          xlabelCounts[xcurrentLabel] += 1
     xnum=sum(xlabelCounts.values())#number of rows with this feature value
     xEnt = 0.0
     for key in xlabelCounts:
        prob = float(xlabelCounts[key]) / xnum
        xEnt += -prob*np.log2(prob)
     print('entropy of y given feature value',label,':',xEnt)
     wei_ent += xnum/len(data)*xEnt
  return wei_ent
IGain(data,1)
print('information gain of this feature:',calcShannonEnt(data)-IGain(data,1))
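
To actually pick the most influential variable, a minimal sketch that reuses the two functions above and loops over every feature column:

base_ent=calcShannonEnt(data)
gains={n: base_ent-IGain(data,n) for n in range(len(data[0])-1)}#columns 0-2
print('information gain per feature:',gains)
print('best split feature: column',max(gains,key=gains.get))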



Naive Bayes

Given the evidence, compute the probability of the outcome.

#naive Bayes
import pandas as pd
import numpy as np
#each row is [x1, x2, x3, y]
data=[['yes','single','high','no'],
      ['no','married','middle','no'],
      ['no','single','low','no'],
      ['yes','married','middle','no'],
      ['no','divorced','middle','yes'],
      ['no','married','low','no'],
      ['yes','divorced','high','no'],
      ['no','single','low','yes'],
      ['no','married','low','no'],
      ['no','single','low','yes']]

#compute one class-conditional probability p(x|y)
def naivebayes(data,n,x,y):#n: column index, x: value of column n, y: class label
   numEntires = len(data)                 #number of rows in the dataset
   labelCounts = {}                       #occurrence count of each y label
   for featVec in data:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
   num=0
   for i in range(len(data)):
        if data[i][-1] == y and data[i][n] == x:
             num = num + 1
   print ('p(',y,') =',labelCounts[y]/numEntires)#the prior p(y)
   return (num/labelCounts[y])#p(x|y)
   
p1=naivebayes(data,0,'no','no')   
print(p1)
p2=naivebayes(data,1,'divorced','no')   
print(p2)
p3=naivebayes(data,2,'low','no')   
print(p3)
#p(y|x) is proportional to p(x1|y)*p(x2|y)*p(x3|y)*p(y) under the naive
#independence assumption, so the prior p(y='no') must be included
py=sum(1 for row in data if row[-1]=='no')/len(data)#p(y='no') = 0.7
print(p1*p2*p3*py)
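
To turn these scores into an actual posterior, a minimal sketch (score is a hypothetical helper, not part of the original post) that evaluates both classes on the same evidence and normalizes:

def score(data,y):
    #unnormalized p(y|x): the prior p(y) times the product of p(x_i|y)
    s=sum(1 for row in data if row[-1]==y)/len(data)
    for n,x in [(0,'no'),(1,'divorced'),(2,'low')]:
        s*=naivebayes(data,n,x,y)
    return s
s_no,s_yes=score(data,'no'),score(data,'yes')
print('p(no|evidence) =',s_no/(s_no+s_yes))
print('p(yes|evidence) =',s_yes/(s_no+s_yes))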


