import pdfplumber
import os
import re
import fitz
import csv


# Names of all entries in the current working directory.
filenames = os.listdir()
prefix = '公开发行证券的公司信息披露内容与格式准则第2号'

# Guideline PDFs: entries whose name starts with `prefix` and ends with '.pdf'.
pdf = [entry for entry in filenames if entry.endswith('.pdf') and entry.startswith(prefix)]
# Revision-year label of each PDF: the 7 characters that end 5 characters
# before the end of the file name.
year = [entry[-12:-5] for entry in pdf]


def getText(pdf):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string holding the extracted text of all pages.
    """
    doc = fitz.open(pdf)
    try:
        pages = []
        for page in doc:
            # PyMuPDF renamed ``getText`` to ``get_text``. Prefer the new
            # name and fall back to the legacy alias on old installations.
            extract = getattr(page, 'get_text', None) or page.getText
            pages.append(extract())
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()
    return ''.join(pages)

def getSubtext(pdf):
    """Return the chapter-2 portion of the PDF text.

    The span starts at the first "第二章 年度报告正文" heading and, because the
    pattern is greedy, runs through the last "第三章 年度报告摘要" heading.
    Both headings are part of the returned text.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The matched text, headings included.

    Raises:
        ValueError: If the two chapter headings cannot be found in the text.
    """
    text = getText(pdf)
    # Raw string: a bare ``\s`` in a normal literal is an invalid escape.
    chapter = re.compile(r'第二章\s*年度报告正文(.*)第三章\s*年度报告摘要', re.DOTALL)
    match = chapter.search(text)
    if match is None:
        raise ValueError('chapter 2 headings not found in %r' % (pdf,))
    return match.group(0)

def getTOC(pdf):
    """Return the section headings found in chapter 2 of the PDF.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        A list of ``(label, title)`` tuples in document order. ``label`` is
        the section marker ("第" + one or two word characters + "节") and
        ``title`` is the text that follows it up to the next line break.
    """
    # Reuse the shared chapter extraction instead of repeating its regex.
    subtext = getSubtext(pdf)
    # A heading must directly follow a line break (look-behind) and be
    # followed by one (look-ahead); neither line break is part of the match.
    heading = re.compile(r'(?<=\n)(第\w{1,2}节)\s+(.*)(?=\n)')
    return heading.findall(subtext)


# Section headings of every guideline PDF, in the same order as `pdf`.
toc = list(map(getTOC, pdf))

def getTOC_Content(pdf):
    """Split chapter 2 of the PDF into the text of its sections.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        A tuple ``(index, content)``. ``index`` holds one ``(start, end)``
        offset pair per section heading, relative to the chapter text.
        ``content`` holds, for each heading, the text between the end of
        that heading and the start of the next one. The last section ends
        at the final line that begins with "第三章", or at the end of the
        chapter text when no such line follows the last heading. Both
        lists are empty when no heading is found.
    """
    subtext = getSubtext(pdf)
    heading = re.compile(r'(?<=\n)(第\w{1,2}节)\s+(.*)(?=\n)')
    index = [match.span() for match in heading.finditer(subtext)]
    if not index:
        # Nothing to split; avoid indexing into an empty list below.
        return ([], [])
    # Body of section i: from the end of heading i to the start of heading i+1.
    content = [
        subtext[body_start:body_end]
        for (_, body_start), (body_end, _) in zip(index, index[1:])
    ]
    last_start = index[-1][1]
    last_end = subtext.rfind('\n第三章')
    if last_end < last_start:
        # rfind() found nothing (-1) or only a hit before the last heading;
        # slicing with that value would silently truncate the last section.
        last_end = len(subtext)
    content.append(subtext[last_start:last_end])
    return (index, content)
# (heading offsets, section texts) for every guideline PDF, in `pdf` order.
list_of_content = list(map(getTOC_Content, pdf))
# Keep only the section texts of each PDF.
content_list = [sections for _, sections in list_of_content]

# HTML template files (names ending in '年度报告.html'). to_html() writes its
# output as 'index' + <template name>, which has the same suffix, so skip
# those files; otherwise a re-run would treat earlier output as a template.
names = [
    f for f in filenames
    if f.endswith('年度报告.html') and not f.startswith('index')
]


def to_html(name,toc,content):
    """Fill an HTML template with section headings and section texts.

    The file ``name`` is read and formatted with the ``%`` operator, so it
    must contain exactly one ``%s`` placeholder. That placeholder receives
    one ``<div>`` per (heading, text) pair. The result is written to
    ``'index' + name``.

    Args:
        name: File name of the HTML template, relative to the working
            directory.
        toc: Sequence of entries whose first two items are the section
            label and the section title.
        content: Sequence of section texts, paired with ``toc`` by position.

    Returns:
        An empty tuple.
    """
    with open(name, encoding='utf-8') as f:
        html = f.read()
    template = '''
        <div>
            <h3>%s</h3>
            <p>%s</p>
        </div>
        '''
    div = ''.join(
        template % (entry[0] + ' ' + entry[1], body)
        for entry, body in zip(toc, content)
    )
    html = html % div
    # The output name depends only on the template name, not on any global.
    with open('index' + name, 'w', encoding='utf-8') as f:
        f.write(html)
    return ()


# Render one output page per (template, headings, section texts) triple;
# zip() stops at the shortest of the three lists.
for template_name, headings, sections in zip(names, toc, content_list):
    print(template_name)
    to_html(template_name, headings, sections)

def to_csv(year,toc):
    """Write section headings to ``<year>.csv``, one heading per row.

    Args:
        year: Base name of the output file; ``'.csv'`` is appended to it.
        toc: Iterable of entries whose first two items (section label and
            section title) become the two columns of a row.
    """
    with open(year + '.csv', mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Write every heading, not just the first one.
        writer.writerows([t[0], t[1]] for t in toc)
# One CSV per guideline PDF; `year` and `toc` are both derived from `pdf`,
# so they line up by position.
for revision, headings in zip(year, toc):
    to_csv(revision, headings)

# Presumably the output printed by the HTML rendering loop (one line per
# template file):
#   2021年年度报告.html
#   2012年年度报告.html
#   2017年年度报告.html
#   2016年年度报告.html

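# Import packages

# Match the files whose data needs to be extracted

# Extract the PDF text with PyMuPDF

# Import the files and functions

# Generate the new HTML

# Final result: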