import pandas as pd
import numpy as np
import openpyxl
import re
import requests
import time
import os
import fitz
xlsx = '半导体行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
links_1 = links[1:-1]
links_2 = ''.join(links_1)
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df1 = pd.DataFrame({'Link': [t[0] for t in list_of_tuple],
'f_name': [t[1] for t in list_of_tuple]})
df1.to_excel('半导体行业_final.xlsx')
print(df1.head(10))
df2=pd.read_excel(r'C:\Users\pc\Desktop\梅峻滔\半导体行业_final.xlsx')
links=df2['Link'];f_names=df1['f_name']
p=re.compile('(?<=\d{4})(年报)|(年年报)')
f_names=[p.sub('年年度报告',f) for f in df2.f_name]
df2['f_name']=f_names;del p,f_names
print(df2)
def filter_links(words,df,include=True):
ls=[]
for word in words:
if include:
ls.append([word in f for f in df.f_name])
else:
ls.append([word not in f for f in df.f_name])
index=[]
for r in range(len(df)):
flag=not include
for c in range(len(words)):
if include:
flag = flag or ls[c][r]
else:
flag = flag and ls[c][r]
index.append(flag)
df3 = df[index]
return(df3)
df_all=filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df1,include=False)
df_orig=filter_links(['(','(',],df_all,include=False)
df_updt=filter_links(['(','(',],df_all,include=True)
df_updt=filter_links(['取消'],df_updt,include=False)
print(df_orig,df_updt)
def sub_with_update(df_updt,df_orig):
index_orig=[];i=0
index_updt=[];j=0
for i,f in enumerate(df_orig):
for j,fn in enumerate(df_updt):
if f in fn:
index_orig.append(i)
index_updt.append(j)
for n in range(len(index_orig)):
i=index_orig[n]
j=index_updt[n]
df_orig.iloc[i,-2]=df_updt.iloc[j,-2]
return(df_orig)
df_newest=sub_with_update(df_updt,df_orig)
df_newest.sort_values(by=['f_name'],inplace=True,ignore_index=True)
print(df_newest)
df_newest['公司简称']=[f[:4] for f in df_newest.f_name]
counts=df_newest['公司简称'].value_counts()
ten_company=[]
for cn in counts.index[:10]:
ten_company.append(filter_links([cn],df_newest))
print(ten_company)
def get_PDF_url(url):
r=requests.get(url);r.encoding='utf-8';html=r.text
r.close()
p=re.compile('<a href=(.*?)\s.*?>(.*?)</a>',re.DOTALL)
a=p.search(html)
if a is None:
Warning('没有找到下载链接,请手动检查链接:%s' % url)
return()
else:
href=a.group(1);fname=a.group(2).strip()
href=r.url[:26]+href
return((href,fname))
for n in range(len(ten_company)):
df_One_Co = pd.DataFrame({'Link':[h for h in ten_company[n]['Link']],
'f_name':[f for f in ten_company[n]['f_name']]})
for i in range(len(df_One_Co)):
href,fname=get_PDF_url(df_One_Co.Link[i])
r=requests.get(href,allow_redirects=True)
open('%s'%fname,'wb').write(r.content)
time.sleep(10)
r.close()
filenames = os.listdir()
print(filenames)
for i in range(len(ten_company)):
prefix= ten_company[i].iloc[1,-1]
print(prefix)
pdf = [f for f in filenames if prefix[:3] in f and f.endswith('.pdf')]
print(pdf)
year = [p[-13:-9] for p in pdf]
print(year)
def getText(pdf):
text = ''
doc = fitz.open(pdf)
for page in doc:
text += page.getText()
doc.close()
return(text)
def parse_data_line(pdf):
text = getText(pdf)
p1 = re.compile('\w{1,2}、主要会计数据和财务指标(.*?)(?=\w{1,2}、)',re.DOTALL)
subtext = p1.search(text)
if subtext is None:
p1 = re.compile('(\w{1,2})\s*主要会计数据(.*?)(?=(\w{1,2})\s*主要财务指标)',re.DOTALL)
subtext = p1.search(text).group(0)
else:
subtext = p1.search(text).group(0)
subp='([0-9,.%\- ]*?)\n' and '([0-9,.%\- ]*?)\s'
psub='%s%s%s%s'%(subp,subp,subp,subp)
p=re.compile('(\D+\n)+%s'%psub)
lines=p.findall(subtext)
return(lines)
filenames = os.listdir()
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #确保显示中文
plt.rcParams['axes.unicode_minus'] = False #确保显示负数的参数设置
for i in range(5):
prefix= ten_company[-i].iloc[1,-1]
print(prefix)
pdf = [f for f in filenames if prefix[:3] in f and f.endswith('.pdf')]
year = [p[-13:-9] for p in pdf]
df_data=pd.DataFrame({'年份':year,
'年营业收入':''})
for y in range(len(pdf)):
lines=parse_data_line(pdf[y])
df_fnc=pd.DataFrame([l for l in lines],columns=['',year[y]+'年',str(eval(year[y])-1)+'年','本年比上年增减',str(eval(year[y])-2)+'年'])
#df_fnc.to_excel('%s%s财务数据.xls'%(prefix,year[y]))
s=df_fnc.iloc[0,1]
s=s.replace(',','')
df_data['年营业收入'][[y]]=eval(s)
print(df_data)
plt.figure()
plt.plot(df_data['年份'],df_data['年营业收入'],label=u'年营业收入',color='#FF8247')
for x,y in zip(df_data['年份'],df_data['年营业收入']):#显示bar数值
plt.text(x,y,'%.3e'%y,ha='center',va='bottom')
plt.xlabel(u'(年)',fontsize=13)
plt.ylabel(u'年营业收入(元)',fontsize=13,rotation=90)
plt.legend(loc='best')
plt.title(u'%s%s-%s年营业收入的可视化'%(prefix,str(year[0]),str(year[-1])),fontsize=13)
plt.yticks(range(0,10**10,10**9))
plt.show()