import os
import re
import fitz # pip install pymupdf
import csv
# Work from the folder that holds the downloaded standards.
os.chdir('/Users/caizuguang/Desktop/finacedata')
filenames = os.listdir()
prefix = '公开发行证券的公司信息披露内容与格式准则第2号'

# The four revised editions must be downloaded as PDF files beforehand.
# NOTE(review): the original note says to place them next to this script,
# but the files are actually read from the chdir target above -- confirm.

# Keep only the format-standard PDFs.
pdf = list(filter(
    lambda name: name.startswith(prefix) and name.endswith('.pdf'),
    filenames,
))

# Revision label, sliced from a fixed position near the end of each file name.
year = list(map(lambda name: name[-12:-5], pdf))
def getText(pdf):
    """Return the concatenated text of every page of a PDF.

    PyMuPDF is used because one of the source files could not be
    extracted with pdfplumber.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The text of all pages, joined in page order.
    """
    doc = fitz.open(pdf)
    try:
        pages = []
        for page in doc:
            # ``getText`` is the legacy camelCase name; current PyMuPDF
            # releases expose ``get_text``. Prefer the new name and fall
            # back to the old one so both old and new installs work.
            extract = getattr(page, 'get_text', None) or page.getText
            pages.append(extract())
        return ''.join(pages)
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()
def getSubtext(pdf):
    """Return the "Chapter 2: annual report body" part of a PDF's text.

    The slice starts at the ``第二章 年度报告正文`` heading and runs through
    the last ``第三章 年度报告摘要`` heading; both headings are included.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The matched text, including both chapter headings.

    Raises:
        ValueError: If the two chapter headings cannot be found.
    """
    text = getText(pdf)
    # Raw string so ``\s`` reaches the regex engine without triggering an
    # invalid-escape warning; DOTALL lets ``.*`` span line breaks.
    p1 = re.compile(r'第二章\s*年度报告正文(.*)第三章\s*年度报告摘要', re.DOTALL)
    found = p1.search(text)
    if found is None:
        raise ValueError(
            'chapter 2 / chapter 3 headings not found in %r' % (pdf,)
        )
    return found.group(0)
# Main part
def getTOC(pdf):
    """Collect the section headings (table of contents) of chapter 2.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        A list of ``(section_label, title)`` tuples in document order,
        where the label has the form ``第…节``.
    """
    subtext = getSubtext(pdf)
    # Raw string: same pattern as before, without invalid escape sequences.
    # A heading is a ``第…节`` label that follows a line break, then
    # whitespace, then the remainder of the line as its title.
    heading = re.compile(r'(?<=\n)(第\w{1,2}节)\s+(.*)(?=\n)')
    return heading.findall(subtext)
# One table of contents per PDF, in the same order as ``pdf``.
toc = list(map(getTOC, pdf))
def getTOC_content(pdf):
    """Split chapter 2 of a PDF into the body text of each section.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        A tuple ``(index, content)``. ``index`` is a list of
        ``(start, end)`` offsets of every section heading inside the
        chapter text; ``content`` is a list with the text that follows
        each heading, in the same order. Both lists are empty when no
        heading is found.
    """
    subtext = getSubtext(pdf)
    # Raw string: same pattern as before, without invalid escape sequences.
    heading = re.compile(r'(?<=\n)(第\w{1,2}节)\s+(.*)(?=\n)')
    index = [(m.start(), m.end()) for m in heading.finditer(subtext)]
    if not index:
        # Nothing to split; avoids an IndexError on index[-1] below.
        return ([], [])

    # Each section body runs from the end of its heading to the start of
    # the next heading.
    content = [
        subtext[prev_end:next_start]
        for (_, prev_end), (next_start, _) in zip(index, index[1:])
    ]

    # The last section stops where the chapter 3 marker begins.
    last_start = index[-1][1]
    last_end = subtext.rfind('\n第三章', last_start)
    if last_end == -1:
        # A bare -1 used as a slice end would silently drop one character
        # and keep the marker; retry without the leading line break.
        last_end = subtext.rfind('第三章', last_start)
    if last_end == -1:
        last_end = len(subtext)
    content.append(subtext[last_start:last_end])
    return (index, content)
# One (index, content) pair per PDF, in the same order as ``pdf``.
list_of_content = list(map(getTOC_content, pdf))
def export_to_html(pdf, toc, content):
    """Render the sections of one PDF into an HTML file.

    Reads ``homework2_output_template.html`` from the current directory,
    substitutes one ``<div>`` per section into its ``%s`` placeholder and
    writes the result to ``homework2_<pdf>.html``.

    Args:
        pdf: Name used to build the output file name.
        toc: Sequence of ``(section_label, title)`` pairs.
        content: Sequence of section body strings, parallel to ``toc``.

    Returns:
        An empty tuple (kept as-is for compatibility with existing callers).
    """
    # Local import: the module name ``html`` is only needed here.
    from html import escape

    with open('homework2_output_template.html', encoding='utf-8') as f:
        page = f.read()

    template = '<div><h3>%s</h3><p>%s</p></div>'
    toc_list = [t[0] + ' ' + t[1] for t in toc]
    # The text comes straight from the PDF, so escape &, < and > to stop
    # it from being interpreted as markup.
    div = ''.join(
        template % (escape(t, quote=False), escape(c, quote=False))
        for t, c in zip(toc_list, content)
    )
    page = page % div

    with open('homework2_%s.html' % pdf, 'w', encoding='utf-8') as f:
        f.write(page)
    return ()
# Section bodies only (second element of each (index, content) pair).
content_list = [c[1] for c in list_of_content]

# Export one HTML page per PDF. zip() follows however many PDFs were found
# instead of assuming exactly four, which raised IndexError with fewer
# files and skipped any beyond the fourth.
for name, headings, sections in zip(pdf, toc, content_list):
    export_to_html(name, headings, sections)