姓名 | 性别 | 学号 |
---|---|---|
张佳静 | 女 | 0194966 |
import re
import requests
import pandas as pd
import openpyxl
import time
import os
import matplotlib.pyplot as plt
import numpy as np
import fitz
# Load the source workbook. Column C of the active sheet is read cell by
# cell and later parsed as '"link","file name"' pairs.
xlsx = '化学原料行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)  # workbook
sheet = exf.active  # worksheet that is currently active in the workbook
C2 = sheet['C2']  # single-cell lookup by Excel coordinate
C = sheet['C']
links = [c.value for c in C]
# NOTE(review): this slice drops the first cell (presumably the header) and
# also the last cell -- confirm the final row really is not data.
links_1 = links[1:-1]
# Merge all cells into one text blob. Empty cells are read as None and would
# make str.join raise TypeError, so they are skipped.
links_2 = ''.join(str(v) for v in links_1 if v is not None)
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({'link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})
df2.to_csv('化学原料行业.csv', encoding='utf-8')
# Read the CSV back through a context manager so the handle is always closed.
with open('化学原料行业.csv', encoding='utf-8') as f:
    df1 = pd.read_csv(f)
links = df1['link']; f_names = df1['f_name']
# f_names = [re.sub('_','',f) for f in f_names]  # strip underscores
# f_names_2 = [re.sub('600\d\d\d','',f) for f in f_names]
def filter_links(words, df, include=True):
    """Select rows of *df* by keywords found in the ``f_name`` column.

    Args:
        words: Iterable of substrings to look for in each file name.
        df: DataFrame with an ``f_name`` column of strings.
        include: When True, keep rows whose name contains at least one of
            *words*. When False, keep rows whose name contains none of them.

    Returns:
        A DataFrame holding the selected rows, with the original index
        labels and columns.
    """
    words = list(words)  # allow one-shot iterables; reused for every row
    # A row is kept when "matches any keyword" agrees with *include*. With no
    # keywords nothing matches, so include=True keeps nothing and
    # include=False keeps everything.
    mask = [any(word in name for word in words) == include
            for name in df.f_name]
    # A boolean Series keeps the columns even for an empty frame, whereas a
    # plain empty list would be read as "select no columns".
    return df[pd.Series(mask, index=df.index, dtype=bool)]
# Drop every file whose name mentions a summary, an inquiry letter or a
# social-responsibility report.
df_all = filter_links(['摘要','问询函','社会责任'],df2,include=False)
# File names carrying a parenthesis go to df_updt, the rest to df_orig.
# Both the half-width '(' and the full-width '(' are checked: the list
# used to name the half-width form twice, which left names written with
# full-width parentheses in df_orig.
df_orig = filter_links(['(','('], df_all,include=False)
df_updt = filter_links(['(','('],df_all,include=True)
# Names containing '取消' are excluded from the update set.
df_updt = filter_links(['取消'],df_updt,include=False)
def sub_with_update(df_updt, df_orig):
    """Replace original entries with the values of their updated versions.

    An original row is matched to an updated row when the original
    ``f_name`` is a substring of the updated ``f_name``. For every match the
    second-to-last column of the original row is overwritten with the value
    from the updated row; the file name itself is left unchanged. If several
    updated rows match, the last one in *df_updt* wins.

    Args:
        df_updt: DataFrame of updated entries with an ``f_name`` column.
        df_orig: DataFrame of original entries with an ``f_name`` column.

    Returns:
        A copy of *df_orig* with the substitutions applied; *df_orig* itself
        is not modified.
    """
    df_newest = df_orig.copy()
    updated_names = list(df_updt.f_name)
    for i, orig_name in enumerate(df_orig.f_name):
        for j, updt_name in enumerate(updated_names):
            if orig_name in updt_name:
                # Positional column -2 is the 'link' column when the frame
                # has exactly the columns ['link', 'f_name'].
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
df_newest = sub_with_update(df_updt, df_orig)
print(df_newest)
df_newest.sort_values(by=['f_name'], inplace=True)
# The first four characters of each file name are used as the company's
# short name.
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
# value_counts() sorts by frequency, so the first ten labels are the short
# names that occur most often.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]
# exist_ok=True replaces the separate "exists?" check and creates the
# output folder only when it is missing.
os.makedirs('10companies', exist_ok=True)
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
# NOTE: os.listdir returns bare file names, without the folder prefix.
ten_csv = os.listdir('10companies')
print(ten_csv)
def extract_links(t):
    """Read the ``link`` column from a CSV file.

    Args:
        t: Path of a UTF-8 encoded CSV file that has a ``link`` column.

    Returns:
        The ``link`` column as a pandas Series.
    """
    # The context manager closes the handle even if parsing fails.
    with open(t, encoding='utf-8') as f:
        df_final_links = pd.read_csv(f)
    return df_final_links['link']
# Collect the links stored in each per-company CSV file.
links_company = []
for csv_name in ten_csv:
    # ten_csv holds bare file names from os.listdir('10companies'), so the
    # folder has to be prepended before the file can be opened.
    links_single = list(extract_links(os.path.join('10companies', csv_name)))
    links_company.append(links_single)
def get_PDF_url(url):
    """Fetch a page and return the target of its first ``<a>`` tag.

    Args:
        url: Address of the page to fetch.

    Returns:
        A tuple ``(href, fname)`` where *href* is the link target prefixed
        with the first 26 characters of the final request URL and *fname* is
        the stripped link text. An empty tuple is returned, and a warning is
        emitted, when the page contains no matching ``<a>`` tag.
    """
    import warnings

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
    finally:
        r.close()
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)  # the first <a> is the target tag, so search() suffices
    if a is None:
        # warnings.warn actually emits the message; merely constructing a
        # Warning instance does nothing.
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()
    # Drop quotes around the attribute value so they never end up in the URL.
    href = a.group(1).strip('\'"')
    fname = a.group(2).strip()
    # NOTE(review): assumes the first 26 characters of the URL are the scheme
    # plus host of the site -- confirm for hosts of a different length.
    href = r.url[:26] + href  # build the complete link
    return (href, fname)
# Download the file behind every collected link into the working directory.
for company_links in links_company:
    for link in company_links:
        result = get_PDF_url(link)
        if not result:
            # get_PDF_url returns an empty tuple when no link was found;
            # skip it instead of failing on tuple unpacking.
            continue
        href, fname = result
        r = requests.get(href, allow_redirects=True)
        try:
            # 'wb': the payload is written as raw bytes.
            with open('%s' % fname, 'wb') as pdf_file:
                pdf_file.write(r.content)
        finally:
            r.close()
filenames = os.listdir()  # every entry in the current working directory
pdf_list = [f for f in filenames if f.endswith('.pdf')]  # keep the PDF files
def extract_data(pdf):
    """Extract company name, year and operating revenue from a report PDF.

    The company name and year are taken from the file name: everything
    before the year is the company name, and the four characters starting at
    the year are the year. The revenue is the first decimal number between
    '营业收入' and '归属于' after the '会计数据和财务指标' section heading.

    Args:
        pdf: File name (path) of the PDF.

    Returns:
        A tuple ``(company_name, year, revenue)`` of strings; *revenue* keeps
        its thousands separators.

    Raises:
        ValueError: If the section heading or the revenue figure cannot be
            found in the PDF text.
    """
    # Prefer a year that is directly followed by '年' so that a '20' inside a
    # stock code is not mistaken for the year; otherwise fall back to the
    # first '20' in the name.
    year_match = re.search(r'20\d{2}(?=年)', pdf)
    idx = year_match.start() if year_match else pdf.find('20')
    company_name = pdf[0:idx]
    year = pdf[idx:idx+4]

    # The context manager closes the document once the text has been read.
    with fitz.open(pdf) as doc:
        text = ''.join(page.get_text() for page in doc)

    p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')
    section_match = p_s.search(text)
    if section_match is None:
        raise ValueError('section heading not found in %s' % pdf)
    s_idx = section_match.start()

    p = re.compile('营业收入(.*?)归属于', re.DOTALL)
    data_match = p.search(text[s_idx:])
    if data_match is None:
        raise ValueError('revenue row not found in %s' % pdf)
    data_line = data_match.group().replace('\n', '')
    p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')
    revenue_match = p_digit.search(data_line)
    if revenue_match is None:
        raise ValueError('revenue figure not found in %s' % pdf)
    return (company_name, year, revenue_match.group())
# Build a new DataFrame that combines the values extracted from every PDF.
companies, years, revenues = [], [], []
for pdf in pdf_list:
    company, year, revenue = extract_data(pdf)
    companies.append(company)
    years.append(year)
    revenues.append(revenue)
df = pd.DataFrame({'company': companies,
                   'year': years,
                   'revenue': revenues})
df.sort_values(by=['year'], inplace=True)  # order the reports by year
# Normalise the company names.
df['company'] = [re.sub('股份有限公司', '', f) for f in df['company']]
# Remove any six-digit run, not only those starting with '60'.
# NOTE(review): assumes a six-digit run in the name prefix is a stock code.
df['company'] = [re.sub(r'\d{6}', '', f) for f in df['company']]
# Remove both the half-width and the full-width colon.
df['company'] = [re.sub('[::]', '', f) for f in df['company']]
df['revenue'] = [re.sub(',', '', f) for f in df['revenue']]  # drop thousands separators
df['revenue'] = pd.to_numeric(df['revenue'])  # convert the strings to numbers
print(df)
def image(d):
    """Plot revenue against year for one company.

    Rows are read from the module-level ``df``; a new figure is created and
    the line is drawn in green on its upper subplot.

    Args:
        d: Company name to select, compared for equality with the
            ``company`` column.
    """
    d1 = df[df['company'] == d]  # rows of the requested company
    # Convert locally rather than rewriting the shared DataFrame on every
    # call; the values are the y axis.
    revenue = pd.to_numeric(d1['revenue'])
    year = d1['year']  # x axis
    plt.figure()
    ax1 = plt.subplot(211)
    ax1.plot(year, revenue, color='g')
    ax1.set_ylabel('revenue')
    ax1.set_title('revenue')
    ax1.grid(True, axis='both')
# Plot revenue by year for the rows whose company is 'ST南化'.
image('ST南化')
据图分析,ST南化的营业收入在近四年呈增长趋势,平均收入规模为5亿左右
# Plot revenue by year for the rows whose company is '三孚股份'.
image('三孚股份')
据图分析,三孚股份近四年营业收入状况为:2019年达到四年最高点,为11.5亿;但其后一年的营业收入为四年最低(10亿)
# Plot revenue by year for the rows whose company is '新疆天业'.
image('新疆天业')
据图分析,新疆天业近四年营业收入状况为:2020年达到四年最高点,超过80亿
# Plot revenue by year for the rows whose company is '红星发展'.
image('红星发展')
据图分析,红星发展的营业收入状况为:2018年达到四年最高点,为16亿;其后两年一直下跌,最近一年(2020年)跌落至14亿,但仍未低于四年最低点
# Plot revenue by year for the rows whose company is '振华股份'.
image('振华股份')
据图分析,最近一年(2020年)振华股份的营业收入大致位于四年平均线处,规模为13亿左右
# Plot revenue by year for the rows whose company is '丰元股份'.
image('丰元股份')
据图分析,四年内丰元股份的营业收入呈现出上下波动趋势,2018年为四年最低,2019年则增为最高
# Plot revenue by year for the rows whose company is '洪汇新材'.
image('洪汇新材')
洪汇新材近四年营业收入持续增长,但增长率呈现出下跌趋势,最近一年(2020年)的收入规模为5.5亿
# Plot revenue by year for the rows whose company is '凯美特气'.
image('凯美特气')
凯美特气近四年营业收入持续增长,但增长率呈现出下跌趋势,最近一年(2020年)的收入规模为5亿
# Plot revenue by year for the rows whose company is '延安必康'.
image('延安必康')
延安必康近三年的营业收入规模最高超过90亿,最低则达到70亿
# Plot revenue by year for the rows whose company is '中泰化学'.
image('中泰化学')
中泰化学近四年营业收入持续增长,但增长率呈现出下跌趋势,最近一年(2020年)的收入规模为800亿