# -*- coding: utf-8 -*-
"""
Created on Thu Nov 11 18:50:39 2021

@author: 玦祎

For every matching PDF in the current working directory, extract the text
between the "第二章 年度报告正文" and "第三章 年度报告摘要" headings, split it at
its "第…节" section headings, and write one HTML page per PDF.
"""

import csv
import functools
import os
import re

import fitz  # pip install pymupdf

filenames = os.listdir()
prefix = '公开发行证券的公司信息披露内容与格式准则第2号'
# You must click the links in the list above, download the PDF files of the
# four revised versions, and place them in the folder containing this code
# file.  (os.listdir() with no argument lists the current working directory.)

# Select the format-standard files.
pdf = [f for f in filenames if f.startswith(prefix) and f.endswith('.pdf')]
# Extract the revision year: the seven characters that precede the last five
# characters of the file name.
# NOTE(review): presumably a label such as "2021年修订" sitting right before
# ").pdf" -- confirm against the actual file names.
year = [f[-12:-5] for f in pdf]

# Everything from the chapter-2 heading up to and including the chapter-3
# heading.  DOTALL lets ".*" run across line breaks; being greedy, it stops at
# the LAST occurrence of the chapter-3 heading.
_CHAPTER_PATTERN = re.compile(r'第二章\s*年度报告正文(.*)第三章\s*年度报告摘要',
                              re.DOTALL)
# A section heading: "第<1-2 word chars>节" right after a line break, followed
# by whitespace and a title that runs to the end of the line.
_SECTION_PATTERN = re.compile(r'(?<=\n)(第\w{1,2}节)\s+(.*)(?=\n)')


@functools.lru_cache(maxsize=None)
def getText(pdf):
    """Return the text of all pages of the PDF file *pdf*, concatenated.

    The result is cached per argument, because the script below asks for the
    text of the same file several times.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf) as doc:
        # get_text() is the current name of the deprecated getText() alias.
        return ''.join(page.get_text() for page in doc)


def getSubtext(pdf):
    """Return the part of the PDF text delimited by the chapter 2/3 headings.

    The returned string starts at "第二章 年度报告正文" and ends with
    "第三章 年度报告摘要" (both headings included).

    Raises:
        ValueError: if the two headings cannot be found in the text.
    """
    text = getText(pdf)
    match = _CHAPTER_PATTERN.search(text)
    if match is None:
        raise ValueError('chapter headings not found in %r' % (pdf,))
    return match.group(0)


def getTOC(pdf):
    """Return the section headings of *pdf* as a list of (number, title) tuples."""
    return _SECTION_PATTERN.findall(getSubtext(pdf))


def getTOC_content(pdf):
    """Split the chapter text of *pdf* at its section headings.

    Returns:
        A tuple ``(index, content)``: ``index`` is a list of ``(start, end)``
        offsets of each heading inside the chapter text, and ``content`` is a
        list holding the text that follows each heading, in the same order.
        Both lists are empty when no heading is found.
    """
    subtext = getSubtext(pdf)
    index = [(m.start(), m.end()) for m in _SECTION_PATTERN.finditer(subtext)]
    if not index:
        # No headings: nothing to split (previously an IndexError).
        return ([], [])
    # Each section body runs from the end of its heading to the start of the
    # next heading.
    content = [subtext[cur[1]:nxt[0]] for cur, nxt in zip(index, index[1:])]
    # The last section ends just before the closing chapter-3 heading.
    last_end = subtext.rfind('\n第三章')
    if last_end == -1:
        # The closing heading is not preceded by a line break; cut at the
        # heading itself instead of slicing with -1.
        last_end = subtext.rfind('第三章')
    content.append(subtext[index[-1][1]:last_end])
    return (index, content)


def export_to_html(pdf, toc, content, year):
    """Write the sections of one PDF to '年报格式准则_<year>.html'.

    Args:
        pdf: unused by this function; kept so existing calls keep working.
        toc: sequence of (number, title) pairs, as returned by getTOC.
        content: section bodies, in the same order as *toc*.
        year: label inserted into the output file name.

    Returns:
        An empty tuple (unchanged from the original ``return()``).

    The file 'homework3_output_template.html' is read from the current
    directory and used as a %-format string that receives a single string
    argument.
    """
    with open('homework3_output_template.html', encoding='utf-8') as f:
        html = f.read()
    # NOTE(review): as received, this per-section template holds only line
    # breaks and the two placeholders; any HTML markup it presumably carried
    # is not visible in the source -- confirm against the intended layout.
    template = '\n\n%s\n\n%s\n\n'
    toc_list = [t[0] + ' ' + t[1] for t in toc]
    div = ''.join(template % (t, c) for (t, c) in zip(toc_list, content))
    html = html % div
    with open('年报格式准则_%s.html' % year, 'w', encoding='utf-8') as f:
        f.write(html)
    return ()


# ---------------------------------------------------------------------------
# Script section (statements run in their original order).
# NOTE(review): the flattened source does not show whether the statements
# marked "(*)" were commented out; they are kept live here -- confirm.
# ---------------------------------------------------------------------------
for f in pdf:  # (*)
    print(f)
print(year)

text_list = [getText(f) for f in pdf]  # (*)
toc = [getTOC(f) for f in pdf]
for t in toc:
    for item in t:
        print(item)
    print('\n')

if pdf:  # (*) guarded so an empty folder does not raise IndexError
    index, content = getTOC_content(pdf[0])
list_of_content = [getTOC_content(f) for f in pdf]

content_list = [c[1] for c in list_of_content]
for f, t, c, y in zip(pdf, toc, content_list, year):
    export_to_html(f, t, c, y)