python 提取pdf格式电子发票并改名

本人小公司企业主一枚，经常接受电子发票，然后有空发给会计，默认的发票基本都是发票号，看不出是哪个公司开的，哪个公司收的。

经过多次研究使用pymupdf读取pdf格式，但是不同省份和城市开出的电子发票细节格式并不相同，最终是按区域先读取再细分信息。

读取多数信息了，至于改名规则可以根据喜好修改。如果有相同的发票会在重复的文件前面加上下划线
_
便于删除

目前广东省，厦门市，浙江省开出的发票测试无误，本程序有一定的容错机制其他省份的大概率是没问题，除非格式特殊

ps：不支持图片及完全曲线化的pdf电子发票（无字体pdf）

python 3.7 及fitz pymupdf ver =1.18.14下运行

# _*_ coding:utf-8 _*_
# fitz pymupdf ver =1.18.14
# Python 3.7
# 2021-11-17
# by gingeer @ CSDN
# 按位置获取发票信息并改名
# pymupdf doc see https://pymupdf.readthedocs.io/en/latest/

## make a new pdf with same size of old one.
##doc = fitz.open("some.file")
##page = doc[0]
##paths = page.get_drawings()  # extract existing drawings
### this is a list of "paths", which can directly be drawn again using Shape
### -------------------------------------------------------------------------
###
### define some output page with the same dimensions
##outpdf = fitz.open()
##outpage = outpdf.new_page(width=page.rect.width, height=page.rect.height)
##shape = outpage.new_shape()  # make a drawing canvas for the output page

import sys
import fitz
import re
import os, math
import datetime
# pip install pyMuPDF not fitz!!!!!!!!!!!!!
def make_text(words):
    """Return textstring output of get_text("words").
    Word items are sorted for reading sequence left to right,
    top to bottom.
    """
    line_dict = {}  # key: vertical coordinate, value: list of words
    words.sort(key=lambda w: w[0])  # sort by horizontal coordinate
    for w in words:  # fill the line dictionary
        y1 = round(w[3], 1)  # bottom of a word: don't be too picky!
        word = w[4]  # the text of the word
        line = line_dict.get(y1, [])  # read current line content
        line.append(word)  # append new word
        line_dict[y1] = line  # write back to dict
    lines = list(line_dict.items())
    lines.sort()  # sort vertically
    lines_2=[" ".join(line[1]) for line in lines]
   
    return lines_2
##    return "\n".join(lines_2)  #原方法



pdfPath='./'
pagesfiles=[]
newnames=[]
files = os.listdir(pdfPath)

pdffiles = [f for f in files if f.lower().endswith('.pdf')]
##print(pdffiles)

#大区域按百分比划分
invoice_rect_per = [0.69,0.02,0.978,0.2]
buyer_rect_per = [0.16,0.214, 0.57,0.369]
seller_rect_per = [0.16,0.744, 0.57,0.884]
total_amounts_per =[0.666,0.692, 0.95,0.749] 

for pdffile in pdffiles:

    print(pdffile)
    doc = fitz.open(pdfPath+pdffile)
    page = doc[0]
##    print(page)
    x_end=page.rect[2]
    y_end=page.rect[3]
    shape=page.new_shape()
##    print(x_end,y_end,page.rect)
    shape.draw_rect(page.rect)
    words = page.get_text("words")  # list of words on page
    
    ##===debug===
##    for word in words:
##        print(word[0]/x_end,word[1]/y_end,word[2]/x_end,word[3]/y_end,word[4])
##        print(word)
##        text_point=fitz.Point(word[0],word[3])
##        shape.draw_circle(text_point,1)
##        shape.insertText(text_point,word[4],color=(1,0,0),fontsize=8)
    #===debug end===


    invoive_rect = fitz.Rect(invoice_rect_per[0] * x_end,invoice_rect_per[1] * y_end, invoice_rect_per[2] * x_end,invoice_rect_per[3] * y_end)
    buyer_rect = fitz.Rect(buyer_rect_per[0] * x_end,buyer_rect_per[1] * y_end,buyer_rect_per[2] * x_end,buyer_rect_per[3] * y_end)
    seller_rect = fitz.Rect(seller_rect_per[0] * x_end,seller_rect_per[1] * y_end,seller_rect_per[2] * x_end,seller_rect_per[3] * y_end)
    total_amounts_rect = fitz.Rect(total_amounts_per[0] * x_end,total_amounts_per[1] * y_end,total_amounts_per[2] * x_end,total_amounts_per[3] * y_end)
    
    shape.draw_rect(total_amounts_rect) ## check positions


    #-----获取重要信息-------

    #----发票信息
    mywords_1 = [w for w in words if fitz.Rect(w[:4]) in invoive_rect]
    mywords_2 = [w for w in words if fitz.Rect(w[:4]).intersects(invoive_rect)]
    
    invoice_infos = make_text(mywords_1)
    check_invoice=False
    for info in invoice_infos:
        if re.search('\d{12}',info) != None:
            invoice_cata = info
            check_invoice=True
        elif re.search('\d{8}',info) != None:
            invoice_num = info
            check_invoice=True
        elif re.search('(\d{4})\D+(\d{2})\D+(\d{2})',info) !=None:
            date_re_result=re.search('(\d{4})\D+(\d{2})\D+(\d{2})',info)
            invoice_date='-'.join([date_re_result.group(1),date_re_result.group(2),date_re_result.group(3)])
            check_invoice=True
    if not check_invoice:
        print('不是发票文件',invoice_infos)
            # ===debug===
##        shape.finish(width=0.5,color=(1,0,0))
##        shape.commit()
##        doc.save('xjj.pdf')
##        doc.close()
##        dpi=200
##        zoom_x = dpi/72
##        zoom_y = dpi/72
##        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
##        pix = page.getPixmap(matrix=mat, alpha=False)
##        pix.set_dpi(dpi,dpi)
##        pix.writePNG(pdffile + '.png')
    # ===debug end===
        continue
    else:
        invoice_num=invoice_num.replace('发票号码','')
        invoice_num=invoice_num.replace('：','')
        invoice_num=invoice_num.replace(':','')
        invoice_num=invoice_num.strip()

    #----购方
    buyer_words = [w for w in words if fitz.Rect(w[:4]) in buyer_rect]
    buyer_infos = make_text(buyer_words)
    for info in buyer_infos:
        if info.find('公司') >= 0:
            buyer=info
            break
        elif info =='个人':
            buyer='个人'

    #----卖方
    seller_words = [w for w in words if fitz.Rect(w[:4]) in seller_rect]
    seller_infos = make_text(seller_words)
    for info in seller_infos:
        if len(info) >3:
            seller=info
            break
                    


    #----合计金额
    total_amount_words = [w for w in words if fitz.Rect(w[:4]) in total_amounts_rect]
    total_amount_infos = make_text(total_amount_words)
    total_amount=total_amount_infos[-1]
    total_amount=total_amount.replace('¥','')
    total_amount=total_amount.replace('￥','')
    
    

    #----新名字    
    connector='_'
    newname_elements=[invoice_date,invoice_num,buyer+'(购)',seller,'￥'+total_amount]
    newname=connector.join(newname_elements) + '.pdf'

    


    
##    doc.save(pdffile)
    

        
    if newname == pdffile:
        print('发票名已经正规化！')
        newnames.append(newname)
        doc.close()
        print('------------------------')
        continue
        
    if newname not in newnames and newname not in pdffiles:
        print(f'原名：{pdffile}\n新名：{newname}')
        newnames.append(newname)
        doc.close()
        os.rename(pdffile,newname)
    else:
        
        while newname in newnames:
            newname = '_'+newname
    
        newnames.append(newname)
        doc.close()
        os.rename(pdffile,'_'+newname)
        print('重复文件名!已加前缀')
    
    print('------------------------')
    # ===debug===
##    shape.finish(width=0.5,color=(1,0,0))
##    shape.commit()
##    dpi=200
##    zoom_x = dpi/72
##    zoom_y = dpi/72
##    mat = fitz.Matrix(zoom_x, zoom_y).preRotate(0)
##    pix = page.getPixmap(matrix=mat, alpha=False)
##    pix.set_dpi(dpi,dpi)
##    pix.writePNG(pdffile + '.png')
    # ===debug end===
    
#    print(newname)

原文链接：https://blog.csdn.net/gingeers/article/details/126144445

你可能也喜欢