简单描述程序功能:
1.停用词为csv文件
2.源文件为txt文件
3.文本处理,将源文件中出现的停用词去除
代码实现:
1.文件读取,分词,源文件词频统计
Python 读取西班牙语文本时使用的编码: encoding='ISO-8859-1'
# Read the stop-word CSV file; it is opened with the ISO-8859-1 encoding
# used for the Spanish text.
def csvfile(file_path=None):
    """Load the stop-word CSV file as a list of row dictionaries.

    The first row of the file supplies the dictionary keys; every following
    row becomes one ``dict`` mapping those keys to that row's values.

    Args:
        file_path: Path of the CSV file to read. When omitted, the file
            ``SpanishStopWords.csv`` inside ``upload_path`` is used.

    Returns:
        A list with one dict per data row, in file order. An empty file
        yields an empty list.
    """
    if file_path is None:
        file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # newline="" leaves line-ending handling to the csv module.
    with open(file_path, "r", encoding="ISO-8859-1", newline="") as f:
        # Without explicit fieldnames, DictReader takes the keys from the
        # first row, so no separate header read is needed.
        return [dict(row) for row in csv.DictReader(f)]


# Read the source txt file
def eachcount(file_path=None):
    """Count how often each word occurs in the source text file.

    The text is tokenized by turning every comma and period into a space
    and then splitting on whitespace.

    Args:
        file_path: Path of the text file to read. When omitted, the file
            ``Alamo.txt`` inside ``upload_path`` is used.

    Returns:
        A list of ``(word, count)`` tuples sorted by count from highest to
        lowest; words with equal counts keep first-seen order. An empty
        file yields an empty list.
    """
    from collections import Counter

    if file_path is None:
        file_path = os.path.join(upload_path, "Alamo.txt")
    with open(file_path, "r", encoding="ISO-8859-1") as f:
        txt = f.read()
    # Tokenize: commas and periods act as separators.
    words = txt.replace(",", " ").replace(".", " ").split()
    # most_common() orders by descending frequency, as the word-frequency
    # ranking is meant to be.
    return Counter(words).most_common()
2.显示在源文件中出现的所有停用词
# Show every stop word that appears in the source file
@application.route('/listsearch/', methods=['GET', 'POST'])
def listsearch():
    """Render ``index.html`` with the stop words found in the source document.

    The stop-word file is split on whitespace into tokens, and each token is
    compared against every element returned by ``docu2()``. A token is added
    to the result once for every element it equals.

    Returns:
        The rendered ``index.html`` template, with the matches passed as
        ``result2``.
    """
    file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # NOTE(review): the CSV is tokenized on whitespace only, which assumes
    # one bare stop word per line with no delimiters or quoting - confirm.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        filelist = f.read().split()
    # docu2() is defined elsewhere; presumably it returns the words of the
    # source document - verify against its definition.
    filelist2 = docu2()
    # Pairwise equality keeps the original matching semantics, including
    # repeated entries when a word matches more than once.
    result2 = [j for j in filelist for i in filelist2 if j == i]
    return render_template('index.html', result2=result2)
前端代码展现:
search
result
{% for line2 in result2 %}
{{ line2 }}
{% endfor %}