Python中常见的文件格式处理
1.读写EXCEL文件
python语言处理excel文件(xls、xlsx)时,一般会使用openpyxl库或xlrd、xlwt库,这里推荐使用openpyxl库进行处理,其可处理的excel行数更多
1.1 EXCEL文件读取
from openpyxl import load_workbook
# Open the workbook to read.
excel = load_workbook("./test.xlsx")
# Look the sheet up by name via indexing; Workbook.get_sheet_by_name() is
# deprecated in openpyxl in favour of this form.
sheet = excel["Sheet1"]
rows = sheet.max_row  # number of rows in the sheet
cols = sheet.max_column  # number of columns in the sheet
# openpyxl rows and columns are 1-based, hence range(1, rows + 1).
for row in range(1, rows + 1):
    # Named row_id rather than id so the builtin id() is not shadowed.
    row_id = sheet.cell(row=row, column=1).value  # first column
    source = sheet.cell(row=row, column=2).value  # second column
    target = sheet.cell(row=row, column=3).value  # third column
    print(row_id, "->", target)
if __name__ == "__main__":
    print("excel读取完毕")
1.2 EXCEL文件写入
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#Author: chengjinpei
import openpyxl
def write_xlsx_file(output_file):
    """Write a 100-row, two-column demo workbook to ``output_file``.

    Every row holds the literal strings "flowid" (column 1) and "context"
    (column 2) in a sheet named "Sheet1".

    Args:
        output_file: Path of the .xlsx file to create or overwrite.
    """
    workbook = openpyxl.Workbook()
    # A new Workbook already contains one empty sheet. Reuse and rename it
    # instead of calling create_sheet(), which would leave that empty sheet
    # in the saved file ahead of "Sheet1".
    worksheet = workbook.active
    worksheet.title = "Sheet1"
    # openpyxl rows are 1-based: fill rows 1..100.
    for row in range(1, 101):
        worksheet.cell(row, 1, "flowid")
        worksheet.cell(row, 2, "context")
    workbook.save(output_file)


if __name__ == "__main__":
    write_xlsx_file("test_output.xlsx")
1.3使用pandas读取excel文件
import pandas as pd
def read_file(filepath, sheet_name="Sheet4"):
    """Load one sheet of an Excel file into a list of per-row dicts.

    Each row is converted to a dict with the keys "id", "text", "letertype",
    "address" and "time", taken from the columns CASE_SERIAL, CASE_CONTENT,
    CASE_EX_ACCORD, RQSTAREA and TIME respectively. Every value is passed
    through ``str()`` and stripped of surrounding whitespace; "text"
    additionally has every "【】" and "*" removed.

    Args:
        filepath: Path of the Excel file to read.
        sheet_name: Sheet to read. Defaults to "Sheet4", the sheet that was
            previously hard-coded.

    Returns:
        A list with one dict per row, in sheet order.
    """
    excel_data = pd.read_excel(filepath, sheet_name=sheet_name)
    data_list = []
    for _, row in excel_data.iterrows():
        data_list.append({
            "id": str(row["CASE_SERIAL"]).strip(),
            "text": str(row["CASE_CONTENT"]).strip().replace("【】", "").replace("*", ""),
            "letertype": str(row["CASE_EX_ACCORD"]).strip(),
            "address": str(row["RQSTAREA"]).strip(),
            "time": str(row["TIME"]).strip(),
        })
    return data_list
2.txt文本的读写
python读取txt文本比较简单,直接使用open即可
input_file = "test.txt"
output_file = "test_out.txt"
# Context managers guarantee both files are closed (and the output flushed)
# even if an error occurs part-way through the copy.
with open(input_file, "r", encoding="utf-8") as fr, \
        open(output_file, "w", encoding="utf-8") as fw:
    # Iterate the file object directly so lines are streamed instead of being
    # loaded into memory all at once.
    for line in fr:
        print(line)
        fw.write(line)
if __name__ == "__main__":
    print("读写文件完毕")
3.json文本的读写
python读写json文本与txt比较相似,使用open打开文件后,再配合json库的load/dump即可
import json
import openpyxl
from openpyxl import load_workbook
workbook = openpyxl.Workbook()
# A new Workbook already contains one empty sheet. Reuse and rename it rather
# than calling create_sheet(), which would leave that empty sheet in the
# saved file ahead of "Sheet1".
worksheet = workbook.active
worksheet.title = "Sheet1"
# Header row.
worksheet.cell(1, 1, "secShortName")
worksheet.cell(1, 2, "secFullName")

# Read the JSON input.
with open("./company.json", "r", encoding="utf-8") as f:
    content = json.load(f)
datas = content["data"]

# Data rows start at row 2, directly below the header.
for i, data in enumerate(datas, start=2):
    # Missing keys fall back to "" (assumes every record is a JSON object).
    secShortName = data.get("secShortName", "")
    secFullName = data.get("secFullName", "")
    secShortNameChg = data.get("secShortNameChg", "")
    short_name = secShortName + "," + secShortNameChg
    # Written to Excel to make the data easier to tally.
    worksheet.cell(i, 1, short_name)
    worksheet.cell(i, 2, secFullName)
workbook.save("company_new.xlsx")

# Write the records back out as JSON.
# NOTE(review): the output name is spelled "conmpany"; kept unchanged here --
# confirm whether "company_new.json" was intended.
with open("./conmpany_new.json", "w", encoding="utf-8") as f:
    json.dump(datas, f, indent=4, ensure_ascii=False)
4.批量读取文件夹下的文件
批量读取某个文件夹下的文件需要使用os库以及python中list,实现代码如下:
def read_file(dir_name):
    """Run ``ann_add_crf`` on every regular file directly under ``dir_name``.

    Each input file is processed into a file of the same name inside an
    ``out`` sub-directory of ``dir_name``, which is created if missing.

    Args:
        dir_name: Directory holding the input files. A trailing path
            separator is optional.
    """
    import os  # local import: the surrounding snippet never imports os

    # Keep regular files only. Sub-directories -- in particular an "out"
    # directory left by an earlier run -- must not be handed to ann_add_crf
    # as if they were input files.
    item_names = [
        item for item in os.listdir(dir_name)
        if os.path.isfile(os.path.join(dir_name, item))
    ]
    filename_list = [os.path.join(dir_name, item) for item in item_names]

    output_path = os.path.join(dir_name, "out")
    os.makedirs(output_path, exist_ok=True)

    print(filename_list)  # all input files found under dir_name
    # Process every input file in turn.
    for item, file in zip(item_names, filename_list):
        print(file)
        # Build the output path from the file name instead of str.replace(),
        # which would rewrite every occurrence of dir_name in the path.
        output_file = os.path.join(output_path, item)
        ann_add_crf(file, output_file)


if __name__ == "__main__":
    read_file("/opt/jpcheng4/tuomin/src/train/original_data/")
版权声明:本文为chengjinpei原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。