import pandas as pd
import openpyxl
import re
# Read column C of the workbook, pull out every "<link>","<name>" pair found in
# its cell values, and save the pairs as a CSV for the later filtering steps.
xlsx = '纺织与制造行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
# Same first/last-cell trimming as before, but skip empty or non-text cells:
# ''.join() raises TypeError as soon as one value is None or a number.
link_1 = [value for value in links[1:-1] if isinstance(value, str)]
link_2 = ''.join(link_1)
# Raw string so the pattern is passed to `re` exactly as written.
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(link_2)
df2 = pd.DataFrame({
    'link': [t[0] for t in list_of_tuple],
    'f_name': [t[1] for t in list_of_tuple],
})
# The index is written as well, so the CSV gains a leading unnamed column.
df2.to_csv('纺织与制造行业.csv')
import re
import pandas as pd
import os
# Reload the link table and rewrite the report-title variants matched by the
# pattern below to the common text '年年度报'.
df = pd.read_csv('纺织与制造行业.csv')
# Raw string: '\d' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
# NOTE(review): the look-behind only guards the first alternative; the second
# alternative matches anywhere in the title. Behaviour is kept as-is - confirm
# whether r'(?<=\d{4})(年报|年年报)' was intended.
p = re.compile(r'(?<=\d{4})(年报)|(年年报)')
f_names = [p.sub('年年度报', f) for f in df.f_name]
df['f_name'] = f_names
del p, f_names
def filter_links(words, df, include=True):
    """Filter the rows of *df* by substrings of its ``f_name`` column.

    Args:
        words: Iterable of substrings to look for.
        df: DataFrame with an ``f_name`` column of strings.
        include: If true, keep rows whose ``f_name`` contains at least one of
            *words*; if false, keep rows whose ``f_name`` contains none of
            them.

    Returns:
        The selected rows of *df*, with their original index labels. With no
        *words*, nothing is kept when ``include`` is true and everything is
        kept when it is false.
    """
    words = list(words)
    contains_any = pd.Series(
        [any(word in name for word in words) for name in df.f_name],
        index=df.index,
        dtype=bool,
    )
    mask = contains_any if include else ~contains_any
    # Index with a boolean ndarray rather than a plain list: for an empty
    # frame the list would be [], and df[[]] selects zero *columns* instead
    # of zero rows, so the result would lose `f_name`.
    return df[mask.to_numpy()]
# Keep only the titles that contain none of the excluded keywords.
df_all = filter_links(
    ['摘要', '询问函', '社会责任', '审计', '财务', '风险', '债券'],
    df,
    include=False,
)
# Titles containing a parenthesis form the update set, minus any title that
# also contains '取消'.
df_updt = filter_links(
    ['取消'],
    filter_links(['(', '('], df_all, include=True),
    include=False,
)
# Titles without a parenthesis form the baseline set.
df_orig = filter_links(['(', '('], df_all, include=False)
def sub_with_update(df_updt, df_orig):
    """Return a copy of *df_orig* patched with values from *df_updt*.

    A row of *df_orig* is patched by every row of *df_updt* whose ``f_name``
    contains the original row's ``f_name`` as a substring. Only the
    second-to-last column is overwritten (at the call site in this file that
    column is ``link``); ``f_name`` itself is left untouched. When several
    updated rows match, the last match wins.

    Args:
        df_updt: DataFrame of updated entries, with an ``f_name`` column.
        df_orig: DataFrame of original entries, with an ``f_name`` column.

    Returns:
        A new DataFrame; neither input is modified.
    """
    df_newest = df_orig.copy()
    updated_names = list(df_updt.f_name)
    for i, name in enumerate(df_orig.f_name):
        for j, updated_name in enumerate(updated_names):
            if name in updated_name:
                # Positional access on both sides, so index labels are
                # irrelevant here.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
# Patch the baseline entries with their updates, then order them by title.
df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)
# The first four characters of the title serve as the company key.
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
# One frame for each of the ten most frequent keys (value_counts sorts by
# descending frequency). Rows are matched by substring of the title.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]
# exist_ok avoids the separate existence check before creating the folder.
os.makedirs('10companies', exist_ok=True)
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
ten_csv = os.listdir('10companies')
import re
import pandas as pd
# Read whitespace-separated names from standard input, map each one to its
# CSV path under ./10companies, and load every file.
names = [
    './10companies/%s.csv' % company
    for company in input().strip().split()
]
df_list = list(map(pd.read_csv, names))
import fitz
import os
# Switch to the folder holding the downloaded reports and collect the PDFs
# whose names end with the annual-report suffix.
os.chdir(r"C:\Users\dell\Desktop\wanxinqi2201408\final project\10companies")
filenames = os.listdir()  # every entry in the current working directory
finalfix = '年年度报告'
pdf_list = list(filter(lambda entry: entry.endswith('年年度报告.pdf'), filenames))
# The four characters right before the 9-character suffix '年年度报告.pdf'.
years = [name[-13:-9] for name in pdf_list]
pdf_list
import fitz
import os
import re
import pandas as pd
# Heading of the financial-data section: a one- or two-character item number,
# an enumeration comma, then a title ending in '会计数据和财务指标'.
_SECTION_RE = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')
# Text from '营业收入' up to the next '归属于' (may span several lines).
_REVENUE_ROW_RE = re.compile('营业收入(.*?)归属于', re.DOTALL)
# A number with optional sign, thousands separators and 1-2 decimals.
_NUMBER_RE = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')


def _split_pdf_name(pdf):
    """Return ``(company_name, year)`` parsed from a report file name."""
    # Preferred layout: '<company><colon><year>...'. Both the ASCII and the
    # full-width colon are accepted.
    for separator in (':', ':'):
        idx = pdf.find(separator)
        if idx != -1:
            return pdf[:idx], pdf[idx + 1:idx + 5]
    # No separator: previously find() returned -1, which cut the last
    # character off the name and used the first four characters as the year.
    # Fall back to the four digits that precede '年' in the name.
    year_match = re.search(r'(\d{4})年', pdf)
    if year_match:
        return pdf[:year_match.start()], year_match.group(1)
    return os.path.splitext(pdf)[0], ''


def extract_data(pdf):
    """Extract the revenue figure from an annual-report PDF.

    Args:
        pdf: Path of the PDF. The company name and year are parsed from this
            string, not from the document contents.

    Returns:
        A tuple ``(company_name, year, revenue)`` of strings; ``revenue`` is
        the first number found after '营业收入' inside the financial-data
        section, with thousands separators removed.

    Raises:
        ValueError: If the section heading, the revenue row or a number in
            that row cannot be found in the extracted text.
    """
    company_name, year = _split_pdf_name(pdf)
    # Close the document as soon as the text has been read.
    with fitz.open(pdf) as doc:
        text = ''.join(page.get_text() for page in doc)
    section_match = _SECTION_RE.search(text)
    if section_match is None:
        raise ValueError('financial-data section not found in %r' % pdf)
    # Search from the section heading onwards.
    row_match = _REVENUE_ROW_RE.search(text, section_match.start())
    if row_match is None:
        raise ValueError('revenue row not found in %r' % pdf)
    data_line = row_match.group().replace('\n', '')
    number_match = _NUMBER_RE.search(data_line)
    if number_match is None:
        raise ValueError('no revenue figure found in %r' % pdf)
    revenue = number_match.group().replace(',', '')
    return (company_name, year, revenue)
# Run the extraction over every PDF and collect the results column by column.
companies, years, revenues = [], [], []
for pdf in pdf_list:
    company, year, revenue = extract_data(pdf)
    companies.append(company)
    years.append(year)
    revenues.append(revenue)
# All three columns hold strings, exactly as returned by extract_data.
df = pd.DataFrame({
    'company': companies,
    'year': years,
    'revenue': revenues,
})
df.sort_values('company', ignore_index=True, inplace=True)
df
company year revenue 0 2016-04-22-002486.SZ-嘉麟杰 2015 700825174.66 1 2017-03-20-002486.SZ-嘉麟杰 2016 727895014.30 2 2018-04-17-002486.SZ-嘉麟杰 2017 883285616.26 3 2019-03-28-002486.SZ-嘉麟杰 2018 879139974.80 4 2020-04-29-002486.SZ-嘉麟杰 2019 1005356349.08 5 2021-04-30-002486.SZ-嘉麟杰 2020 1171293263.40 6 600107 美尔雅2018年度报告 470953799.24 7 600107湖北美尔雅股份有限公司2020年年度报告.pd 6001 338918564.50 8 600107美尔雅2016年年度报告.pd 6001 434337009.00 9 600107美尔雅2017年年度报告.pd 6001 443660878.19 10 600107美尔雅2019年年度报告.pd 6001 447250852.08 11 600156华升股份2016年年度报告.pd 6001 593146428.82 12 600156华升股份2018年年度报告.pd 6001 1016859661.35 13 600156华升股份2019年年度报告.pd 6001 103824.45 14 600156华升股份2020年年度报告.pd 6001 91353.87 15 600220江苏阳光2016年年度报告.pd 6002 2092172271.84 16 600220江苏阳光2019年年度报告.pd 6002 2349026226.65 17 600220江苏阳光股份有限公司2020年年度报告.pd 6002 1972351946.25 18 600689上海三毛2017年年度报告.pd 6006 1277461194.16 19 600689上海三毛2018年年度报告.pd 6006 1378099486.15 20 600689上海三毛2019年年度报告.pd 6006 1369543327.65 21 600689上海三毛企业(集团)股份有限公司2020年年度报告.pd 6006 945417541.82 22 孚日股份 2020 4432082348.81 23 孚日股份 2018 5170527632.84 24 孚日股份 2016 4374976362.28 25 孚日股份 2017 4821675351.22 26 新野纺织 2015 3046708978.24 27 新野纺织 2016 4085467260.70 28 新野纺织 2017 5195202111.76 29 新野纺织 2018 6059889289.10 30 新野纺织 2019 5732043894.74 31 新野纺织 2020 4865937958.66 32 棒杰股份 2020 625631302.42 33 棒杰股份 34 棒杰股份 2018 415512906.25 35 棒杰股份 2017 434094913.54 36 棒杰股份 2016 389873815.96 37 浔兴股份 2015 1041456861.05 38 浔兴股份 2016 1175490158.10 39 浔兴股份 2017 1859901688.90 40 浔兴股份 2018 2272495886.73 41 浔兴股份 2019 1919149402.77 42 浔兴股份 2020 1589919387.95 43 联发股份 2018 4168693635.91 44 联发股份 2017 4003621033.77 45 联发股份 2019 3910527325.36 46 联发股份 2015 3476970839.03 47 联发股份 2016 3738610953.91 48 联发股份 2020 3873259434.39
import numpy as np
from matplotlib import pyplot as plt
# Take rows 0-5 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_jlj = df.iloc[0:6]
df_jlj
df2_jlj = df_jlj.sort_values(by='year', ignore_index=True)
y_jlj, r_jlj = df2_jlj['year'], df2_jlj['revenue']
y_jlj
r_jlj
0 700825174.66 1 727895014.30 2 883285616.26 3 879139974.80 4 1005356349.08 5 1171293263.40 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
x = [2015, 2016, 2017, 2018, 2019, 2020]
y = [700825174.66, 727895014.30, 883285616.26, 879139974.80, 1005356349.08, 1171293263.40]
# Build the title from the plotted range; the fixed text said 2016-2020
# although the x axis starts at 2015.
plt.title('revenue changing during %d-%d' % (x[0], x[-1]))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x, y)
plt.show()
# Take rows 6-11 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_mry = df.iloc[6:12]
df2_mry = df_mry.sort_values(by='year', ignore_index=True)
y_mry, r_mry = df2_mry['year'], df2_mry['revenue']
r_mry
0 470953799.24 1 338918564.50 2 434337009.00 3 443660878.19 4 447250852.08 5 593146428.82 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
x = [2015, 2016, 2017, 2018, 2019, 2020]
y = [470953799.24, 338918564.50, 434337009.00, 443660878.19, 447250852.08, 593146428.82]
plt.plot(x, y)
plt.ylabel("revenue")
plt.xlabel("year")
plt.title('revenue changing during 2015-2020')
plt.show()
# Take rows 13-17 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_hs = df.iloc[13:18]
df2_hs = df_hs.sort_values(by='year', ignore_index=True)
y_hs, r_hs = df2_hs['year'], df2_hs['revenue']
r_hs
0 103824.45 1 91353.87 2 2092172271.84 3 2349026226.65 4 1972351946.25 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
x = [2016, 2017, 2018, 2019, 2020]
y = [103824.45, 91353.87, 2092172271.84, 2349026226.65, 1972351946.25]
# Build the title from the plotted range; the fixed text said 2015-2020
# although the x axis starts at 2016.
plt.title('revenue changing during %d-%d' % (x[0], x[-1]))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x, y)
plt.show()
# Take rows 18-22 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_jsyg = df.iloc[18:23]
df2_jsyg = df_jsyg.sort_values(by='year', ignore_index=True)
y_jsyg, r_jsyg = df2_jsyg['year'], df2_jsyg['revenue']
r_jsyg
0 4432082348.81 1 1277461194.16 2 1378099486.15 3 1369543327.65 4 945417541.82 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
x = [2015, 2016, 2017, 2019, 2020]
y = [4432082348.81, 1277461194.16, 1378099486.15, 1369543327.65, 945417541.82]
plt.plot(x, y)
plt.ylabel("revenue")
plt.xlabel("year")
plt.title('revenue changing during 2015-2020')
plt.show()
# Take rows 24-28 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_shsm = df.iloc[24:29]
df2_shsm = df_shsm.sort_values(by='year', ignore_index=True)
y_shsm, r_shsm = df2_shsm['year'], df2_shsm['revenue']
r_shsm
0 3046708978.24 1 4374976362.28 2 4085467260.70 3 4821675351.22 4 5195202111.76 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
x = [2016, 2017, 2018, 2019, 2020]
y = [3046708978.24, 4374976362.28, 4085467260.70, 4821675351.22, 5195202111.76]
# Build the title from the plotted range; the fixed text said 2015-2020
# although the x axis starts at 2016.
plt.title('revenue changing during %d-%d' % (x[0], x[-1]))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x, y)
plt.show()
# Take rows 30-33 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_fr = df.iloc[30:34]
df2_fr = df_fr.sort_values(by='year', ignore_index=True)
y_fr, r_fr = df2_fr['year'], df2_fr['revenue']
r_fr
0 5732043894.74 1 600896507.81 2 4865937958.66 3 625631302.42 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
x = [2017, 2018, 2019, 2020]
y = [5732043894.74, 600896507.81, 4865937958.66, 625631302.42]
plt.plot(x, y)
plt.ylabel("revenue")
plt.xlabel("year")
plt.title('revenue changing during 2017-2020')
plt.show()
# Take rows 26-30 of the revenue table and order them by year.
# NOTE(review): the row positions are hard-coded - confirm they still match
# the current contents of df.
df_xy = df.iloc[26:31]
df2_xy = df_xy.sort_values(by='year', ignore_index=True)
y_xy, r_xy = df2_xy['year'], df2_xy['revenue']
r_xy
0 1041456861.05 1 389873815.96 2 1175490158.10 3 434094913.54 4 1859901688.90 Name: revenue, dtype: object
# Line chart of the hand-entered revenue figures against year.
# x used to list six years for five revenue values, which makes plt.plot
# raise ValueError (x and y must have the same length).
# NOTE(review): the trailing year was dropped to match the five values -
# confirm which years these figures actually belong to.
x = [2015, 2016, 2017, 2018, 2019]
y = [1041456861.05, 389873815.96, 1175490158.10, 434094913.54, 1859901688.90]
# Keep the title in step with the plotted range.
plt.title('revenue changing during %d-%d' % (x[0], x[-1]))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x, y)
plt.show()
# Take the first 36 rows of the revenue table, order them by year, then draw
# a line chart of hand-entered figures.
# NOTE(review): this rebinds df_hs / df2_hs / y_hs / r_hs from an earlier
# cell, and the y values below are identical to the ones plotted for the
# df_mry cell - confirm the intended slice and figures.
df_hs = df.iloc[:36]
df2_hs = df_hs.sort_values(by='year', ignore_index=True)
y_hs, r_hs = df2_hs['year'], df2_hs['revenue']
r_hs
x = [2015, 2016, 2017, 2018, 2019, 2020]
y = [470953799.24, 338918564.50, 434337009.00, 443660878.19, 447250852.08, 593146428.82]
plt.plot(x, y)
plt.ylabel("revenue")
plt.xlabel("year")
plt.title('revenue changing during 2015-2020')
plt.show()
# Take rows 37-41 of the revenue table, order them by year, then draw a line
# chart of hand-entered figures.
# NOTE(review): the slice holds five rows while six points are plotted -
# confirm the slice bounds.
df_xx = df.iloc[37:42]
df2_xx = df_xx.sort_values(by='year', ignore_index=True)
y_xx, r_xx = df2_xx['year'], df2_xx['revenue']
r_xx
x = [2015, 2016, 2017, 2018, 2019, 2020]
y = [1041456861.05, 1175490158.10, 1859901688.90, 2272495886.73, 1919149402.77, 1589919387.95]
plt.plot(x, y)
plt.ylabel("revenue")
plt.xlabel("year")
plt.title('revenue changing during 2015-2020')
plt.show()
# Take rows 43-47 of the revenue table, order them by year, then draw a line
# chart of hand-entered figures.
# NOTE(review): the slice holds five rows while six points are plotted -
# confirm the slice bounds.
df_lf = df.iloc[43:48]
df2_lf = df_lf.sort_values(by='year', ignore_index=True)
y_lf, r_lf = df2_lf['year'], df2_lf['revenue']
r_lf
x = [2015, 2016, 2017, 2018, 2019, 2020]
y = [3476970839.03, 3738610953.91, 4003621033.77, 4168693635.91, 3910527325.36, 3873259434.39]
plt.plot(x, y)
plt.ylabel("revenue")
plt.xlabel("year")
plt.title('revenue changing during 2015-2020')
plt.show()