python 多个文本 去停用词_python文本处理 数据挖掘 停用词检索

  • Post author:
  • Post category:python


简单描述程序功能:

1.停用词为csv文件

2.源文件为txt文件

3.文本处理,将源文件中出现的停用词去除

代码实现:

1.文件读取,分词,源文件词频统计

Python 读取西班牙语文本时使用的编码: encoding='ISO-8859-1'

# Read the stop-word CSV file; the file is encoded as ISO-8859-1 (Spanish text).
def csvfile(file_path=None):
    """Load a CSV file into a list of dictionaries, one per data row.

    The first row of the file supplies the dictionary keys; every following
    row becomes one ``dict`` mapping those keys to the row's values.

    Args:
        file_path: Path of the CSV file to read. When omitted, the file
            ``SpanishStopWords.csv`` inside ``upload_path`` is used.

    Returns:
        A list of plain ``dict`` objects in file order. An empty file
        yields an empty list.
    """
    if file_path is None:
        file_path = os.path.join(upload_path, "SpanishStopWords.csv")

    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to a reader.
    with open(file_path, "r", encoding="ISO-8859-1", newline="") as f:
        # DictReader consumes the header row itself, so no manual next() call
        # is needed and an empty file no longer raises StopIteration.
        return [dict(row) for row in csv.DictReader(f)]


# Reading of the txt file.

def eachcount(file_path=None):
    """Count how often each word occurs in a text file.

    The text is tokenised by replacing every comma and full stop with a
    space and then splitting on whitespace. No case folding is applied.

    Args:
        file_path: Path of the text file to read (ISO-8859-1 encoded). When
            omitted, the file ``Alamo.txt`` inside ``upload_path`` is used.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in ascending
        order; words with equal counts keep first-seen order.
    """
    if file_path is None:
        file_path = os.path.join(upload_path, "Alamo.txt")

    # Use a context manager so the handle is closed once the text is read.
    with open(file_path, "r", encoding="ISO-8859-1") as f:
        txt = f.read()

    # Tokenise: treat ',' and '.' as separators, then split on whitespace.
    words = txt.replace(",", " ").replace(".", " ").split()

    counts = {}
    for word in words:
        # get() returns 0 for a word that has not been seen yet.
        counts[word] = counts.get(word, 0) + 1

    # NOTE(review): the original comment described a descending sort, but the
    # code sorted ascending (reverse=False). The ascending order is kept here
    # so existing callers see the same result — confirm which was intended.
    return sorted(counts.items(), key=lambda item: item[1])

2.显示在源文件中出现的所有停用词

# Show every stop word that occurs in the source file.
@application.route('/listsearch/', methods=['GET', 'POST'])
def listsearch():
    """Render ``index.html`` with the stop words found in the source document.

    The stop-word file ``SpanishStopWords.csv`` (inside ``upload_path``) is
    read as plain text and split on whitespace. Each resulting token is
    compared with every item returned by ``docu2()``, and the token is added
    to the result once per equal item. A stop word that matches several
    items therefore appears several times, in stop-word-file order.

    Returns:
        The response produced by ``render_template`` for ``index.html``,
        with the matches passed to the template as ``result2``.
    """
    file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # Use a context manager so the handle is closed once the file is read.
    # NOTE(review): the file is split on whitespace rather than parsed as
    # CSV — presumably it holds one stop word per line; confirm.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        filelist = f.read().split()

    # docu2() is defined elsewhere; presumably it returns the words of the
    # source document — verify against its definition.
    filelist2 = docu2()

    # One entry per (stop word, document item) pair that compares equal.
    result2 = [j for j in filelist for i in filelist2 if j == i]

    return render_template('index.html', result2=result2)

前端代码展现:

search

result

{% for line2 in result2 %}

{{ line2 }}

{% endfor %}