学号:4201370
import re
import pandas as pd
import openpyxl
xlsx ='非银行金融行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
links_1=links[1:-1]
links_2=''.join(links_1)
sample='=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=EAD42C52AF0A&id=123788902&type=1","经纬纺机:2020年年度报告(更新后))'
p=re.compile('"(.*?)","(.*?)"')
list_of_tuple=p.findall(links_2)
df2=pd.DataFrame({'Link':[t[0]for t in list_of_tuple],
'f_name':[t[1]for t in list_of_tuple]})
df2.to_csv('非银行金融行业.csv')
import requests
import re
import pandas as pd
import os
import time
f = open('非银行金融行业.csv',encoding='utf-8').read()
df = pd.read_csv(f)
p = re.compile('(?<=\d{4}(年度))')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names
def filter_links(words,df,include=True):
ls = []
for word in words:
if include:
ls.append([word in f for f in df.f_name])
else:
ls.append([word not in f for f in df.f_name])
index = []
for r in range(len(df)):
flag = not include
for c in range(len(words)):
if include:
ls.append([word not in f for f in df.f_name])
index=[]
for r in range(len(df)):
flag=not include
for c in range(len(words)):
if include:
flag = flag or ls[c][r]
else:
flag = flag and ls[c][r]
index.append(flag)
df2=df[index]
return(df2)
df_all = filter_links(['摘要'],df,include=[False])
df_orig = filter_links(['(','('],df_all,include=[False])
df_updt = filter_links(['(','('],df_all,include=[True])
df_updt = filter_links(['取消'],df_updt,include=[False])
def sub_with_update(df_updt,df_orig):
df_newest = df_orig.copy()
index_orig=[]
index_updt=[]
for i,f in enumerate(df_orig.f_name):
for j,fn in enumerate(df_updt.f_name):
if f in fn:
index_orig.append(i)
index_updt.append(j)
for n in range(len(index_orig)):
i = index_orig[n]
j = index_updt[n]
df_orig.iloc[i,-2] = df_updt.iloc[j,-2]
return(df_newest)
df_newest = sub_with_update(df_updt,df_orig)
df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
ten_company = []
for cn in counts.index[:10]:
ten_company.append(filter_links([cn],df_newest))
if not os.path.exists('10companies'):
os.makedirs('10companies')
for df_com in ten_company:
cn=df_com['公司简称'].iloc[0]
df_com.to_csv('10companies/%s.csv' % cn)
ten_csv=os.listdir('10companies')
import re
import pandas as pd
import os
os.chdir('/C:/Users/lenovo/报告/10companies')
fil=os.listdir()
fil.remove(fil[2])
links= []
f_names=[]
for dfil in fil:
m= open(dfil,encoding='utf-8')
fmo = pd.read_csv(m)
for link in fmo['link']:
links.append(link)
for f_name in fmo['f_name']:
f_names.append(f_name)
def get_PDF_url(url):
r = requests.get(url);r.encoding = 'utf-8'; html = r.text
r.close()
p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
a = p.search(html)
if a is None:
Warning('没有找到下载链接。请手动检查链接:%s' % url)
return()
else:
href = a.group(1); fname = a.group(2).strip()
href = r.url[:26] + href
return((href,fname))
hrefs=[];fnames=[]
for link in links:
href,fname = get_PDF_url(link)
hrefs.append(href)
fnames.append(fname)
time.sleep(10)
df_final_links=pd.DataFrame({'href':hrefs,'fname':fnames})
df_final_links.to_csv('final_links非银行金融行业.csv')
m1= open('final_links非银行金融行业.csv',encoding='gbk')
df_final_links=pd.read_csv(m1)
f_names=df_final_links['f_name']
hrefs=df_final_links['href']
for i in range(len(hrefs)):
href=hrefs[i];f_name=f_names[i]
r = requests.get(href, allow_redirects=True)
open('%s' %f_name, 'wb').write(r.content)
time.sleep(10)
r.close()
import re
import fitz
import os
filenames = os.listdir()
finalfix = '年年度报告摘要'
pdf_list = [f for f in filenames if f.endswith('年年度报告摘要.pdf')]
years = [f[-13:-9] for f in pdf_list]
pdf_list
def extract_data(pdf):
idx = pdf.find(':')
company_name = pdf[0:idx]
year = pdf[idx+1:idx+5]
#
doc = fitz.open(pdf) #打开pdf
text = [page.get_text() for page in doc]
text = ''.join(text)
#
p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')
section_match = p_s.search(text)
s_idx = section_match.start()
p = re.compile('营业收入(.*?)归属于',re.DOTALL)
data_line = p.search(text[s_idx:]).group()
data_line = data_line.replace('\n', '')
p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')
revenue = p_digit.search(data_line).group()
revenue = revenue.replace(',','')
return((company_name,year,revenue))
companies, years, revenues = [],[],[]
for pdf in pdf_list:
company, year, revenue = extract_data(pdf)
companies.append(company)
years.append(year)
revenues.append(revenue)
df.sort_values('company',ignore_index=True, inplace=True)
df
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['font.family'] = 'simhei'
df = pd.read_excel("C://Users//lenovo//报告//10companies.xlsx")
print(df)
plt.plot(df["年份"],df["熊猫金控"],label='熊猫金控',linewidth=1,color='r',markersize=12)
plt.plot(df["年份"],df["九鼎投资"],label='九鼎投资',linewidth=1,color='y',markersize=12)
plt.plot(df["年份"],df["新力金融"],label='新力金融',linewidth=1,color='blue',markersize=12)
plt.plot(df["年份"],df["中银国际证券"],label='中银国际证券',linewidth=1,color='purple',markersize=12)
plt.plot(df["年份"],df["瑞达期货"],label='瑞达期货',linewidth=1,color='green',markersize=12)
plt.plot(df["年份"],df["华鑫股份"],label='华鑫股份',linewidth=1,color='brown',markersize=12)
plt.plot(df["年份"],df["江苏国信"],label='江苏国信',linewidth=1,color='black',markersize=12)
plt.plot(df["年份"],df["国联证券"],label='国联证券',linewidth=1,color='pink',markersize=12)
plt.plot(df["年份"],df["中金公司"],label='中金公司',linewidth=1,color='grey',markersize=12)
plt.plot(df["年份"],df["中泰证券"],label='中泰证券',linewidth=1,color='violet',markersize=12)
plt.xlabel("年份")
plt.ylabel('营业总收入')
plt.title("营业收入走势图")
plt.legend()
plt.grid()
plt.show()