我用的是 Anaconda3 来运行 Python 代码:
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 10 11:02:50 2017

@author: Administrator

Compare two titles by the cosine similarity of their word-count vectors.

Both titles are segmented with jieba, stop words are dropped, and the
resulting word counts are compared with ``get_cossimi``.
"""
import codecs  # reads the stop-word file as UTF-8
import copy
import sys

import numpy as np  # array arithmetic for the similarity computation

title2 = "张翰怒斥耍大牌被换角谣言"
title1 = "爱剪辑-危机四伏,落水就会被高压电弄死"
# Alternative sample pairs:
# title1 = "王凯《跨界歌王》姗姗来迟“低音炮”开嗓献唱 - 搜狐视频"
# title2 = "《跨界歌王》王凯清唱“好久不见” 低音炮名不虚传_视频在线观看 - 56.com"
# title1 = "王凯《跨界歌王》姗姗来迟“低音炮”开嗓献唱 - 搜狐视频"
# title2 = "范爷维权获赔15万全捐赠"

# Stop-word list: UTF-8 text, one word per line.
STOPWORD_PATH = "C:\\Users\\Administrator\\Desktop\\stopword.txt"


def get_cossimi(x, y):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers with the same length as ``x``.

    Returns:
        ``sum(x * y) / (|x| * |y|)`` as a float. When either vector has
        zero norm (for example, the two titles share no words) the
        similarity is undefined; 0.0 is returned instead of ``nan``.
    """
    myx = np.asarray(x, dtype=float)
    myy = np.asarray(y, dtype=float)
    denominator = np.sqrt(np.sum(myx * myx)) * np.sqrt(np.sum(myy * myy))
    if denominator == 0:
        # Guard the division: 0/0 would yield nan plus a RuntimeWarning.
        return 0.0
    return float(np.sum(myx * myy) / denominator)


def _parse_stopwords(text):
    """Turn the stop-word file content into a set with one entry per line."""
    # splitlines() also consumes "\r\n"; split("\n") left a trailing "\r"
    # on every entry of a Windows-style file, so no stop word ever matched.
    return set(text.splitlines())


def _load_stopwords(path):
    """Read the UTF-8 stop-word file at ``path`` and return its words."""
    with codecs.open(path, "r", "utf-8") as f_stop:
        return _parse_stopwords(f_stop.read())


def _is_counted(word, stopwords):
    """Tell whether ``word`` is neither blank nor a stop word."""
    token = word.strip()
    return bool(token) and token not in stopwords


def _count_words(words, stopwords):
    """Count every occurrence of each counted word in ``words``."""
    counts = {}
    for word in words:
        if _is_counted(word, stopwords):
            counts[word] = counts.get(word, 0) + 1
    return counts


def _count_known_words(words, vocabulary, stopwords):
    """Count occurrences in ``words`` of the words present in ``vocabulary``.

    Words missing from ``vocabulary`` are ignored, so the result has
    exactly the keys of ``vocabulary``.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for word in words:
        if _is_counted(word, stopwords) and word in counts:
            counts[word] += 1
    return counts


def main():
    """Segment both titles and print their cosine similarity."""
    # Imported here rather than at module level so that get_cossimi()
    # stays importable in environments where jieba is not installed.
    import jieba  # Chinese word segmentation

    print("loading...")
    print("working...")

    # TODO: register a custom dictionary to cover words missing from
    # jieba's default dictionary and improve segmentation accuracy.
    sample_tokens = jieba.cut(title1)
    test_tokens = jieba.cut(title2)

    stopwords = _load_stopwords(STOPWORD_PATH)

    # NOTE: the vector space is title1's vocabulary only; words that occur
    # only in title2 are ignored, so the score is not symmetric.
    all_words = _count_words(sample_tokens, stopwords)
    mytest1_words = _count_known_words(test_tokens, all_words, stopwords)

    # Build both vectors in the same key order.
    sampdate = [all_words[key] for key in all_words]
    test1data = [mytest1_words[key] for key in all_words]

    test1simi = get_cossimi(sampdate, test1data)
    print(u"title1与title2的余弦相似度%f"%(test1simi))


if __name__ == '__main__':
    main()
结果为:
runfile('C:/Users/Administrator/Desktop/cosine xiangsi/cosine simlitary.py', wdir='C:/Users/Administrator/Desktop/cosine xiangsi') Building prefix dict from the default dictionary ... Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache loading... working... Loading model cost 0.658 seconds. Prefix dict has been built succesfully. title1与title2的余弦相似度0.301511
我将代码用 Java 进行封装(通过 Jython 的 PythonInterpreter 执行脚本):
import org.python.util.PythonInterpreter; public class juli { public static void main(String args[]) { PythonInterpreter interpreter = new PythonInterpreter(); interpreter.execfile("D:\\download\\cosine simlitary.py"); }//main }
出现错误:
Exception in thread "main" Traceback (innermost last):
(no code object) at line 0
SyntaxError: ('Lexical error at line 20, column 35. Encountered: "\\r" (13), after : ""', ('D:\\download\\cosine simlitary.py', 20, 35, u'title1 = "\u95C4\u5823\u4F46\u741A\uE0A6\u734B\u93C4\uE21C\u5C13 \u8930\u64B3\u6E80\u9359\u6226\uE5DE\u93C6\u5B58\u5F3D\u93CB\u6941\u6D3F\u93C2\uFFFD'))
我当时猜测可能是测试的句子中有空格出现。(不过从报错信息里标题字符串已变成乱码、且结尾的引号丢失来看,更可能是 Jython 没有按 UTF-8 读取脚本文件,导致字符串无法正常结束。)
我进行了一些改正又出现了错误提示:
Exception in thread "main" Traceback (innermost last):
File "D:\download\cosine simlitary.py", line 9, in ?
ImportError: no module named numpy
仔细想了好久还是不行,后来看到了一篇博客,运用他的方法,最后结果终于正确了:http://blog.csdn.net/ztf312/article/details/51338060
package cos;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Runs the cosine-similarity Python script in an external Python process and
 * echoes the script's standard output line by line.
 */
public class juli {
    /** Python interpreter used to run the script. */
    private static final String PYTHON_EXE = "C:\\ProgramData\\Anaconda3\\python.exe";

    /** Script to run, resolved against the current working directory. */
    private static final String SCRIPT = "untitled0.py";

    /**
     * Starts the Python process, prints everything it writes to standard
     * output, and waits for it to finish.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            System.out.println("start");

            // Pass the command as separate arguments: Runtime.exec(String)
            // splits on whitespace, which breaks as soon as a path contains
            // a space.
            ProcessBuilder builder = new ProcessBuilder(PYTHON_EXE, SCRIPT);
            // Forward the child's stderr to this process's stderr. If it
            // were left unread, Python error messages would be lost and a
            // full pipe buffer could block the child.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            Process pr = builder.start();

            // try-with-resources closes the reader even if reading fails.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(pr.getInputStream()))) {
                String line;
                while ((line = in.readLine()) != null) {
                    System.out.println(line);
                }
            }

            int exitCode = pr.waitFor();
            if (exitCode != 0) {
                System.err.println("python exited with code " + exitCode);
            }
            System.out.println("end");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}
结果如下:
start
loading...
working...
title1与title2的余弦相似度0.301511
end