import requests
import pandas as pd
import openpyxl
import os
# Load the announcement index of the semiconductor industry.
# The path is relative to the working directory (prefix it with ../ if the
# CSV lives in the parent directory). The context manager closes the file
# handle as soon as pandas has read it.
with open('半导体行业.csv', encoding='utf-8') as f:
    df = pd.read_csv(f)
def filter_links(words, df, include=True):
    """Select rows of *df* by substring tests on its ``f_name`` column.

    Args:
        words: Substrings to look for in each file name.
        df: DataFrame with an ``f_name`` column of strings.
        include: When true, keep rows whose name contains at least one of
            *words*; when false, keep rows whose name contains none of them.

    Returns:
        A DataFrame with the selected rows of *df*; the original index labels
        are kept. With an empty *words*, ``include=True`` selects nothing and
        ``include=False`` selects everything.
    """
    words = list(words)
    contains_any = [any(word in name for word in words) for name in df.f_name]
    if include:
        keep = contains_any
    else:
        keep = [not hit for hit in contains_any]
    # Wrap the flags in a boolean Series aligned on df's index. A bare empty
    # list would be read by ``df[...]`` as "select no columns", which strips
    # the columns from an empty frame instead of returning zero rows.
    mask = pd.Series(keep, index=df.index, dtype=bool)
    return df[mask]
# Drop announcements whose title contains 摘要 (abstract), 问询函 (inquiry
# letter) or 社会责任 (social responsibility).
df_all = filter_links(['摘要', '问询函', '社会责任'], df, include=False)

# Titles with a parenthesis are treated as updated filings, titles without
# one as the original filings.
# NOTE(review): the list used to contain the same '(' twice, which looks like
# a full-width '(' (U+FF08) that was lost along the way. Both forms are
# matched here - confirm against the titles in the CSV.
_PAREN_MARKERS = ['(', '\uff08']
df_original = filter_links(_PAREN_MARKERS, df_all)
df_orig = filter_links(_PAREN_MARKERS, df_all, include=False)
df_updt = filter_links(_PAREN_MARKERS, df_all, include=True)
# Ignore updates whose title contains 取消 (cancelled).
df_updt = filter_links(['取消'], df_updt, include=False)
def sub_with_update(df_updt, df_orig):
    """Return a copy of *df_orig* patched with values from matching updates.

    An update row matches an original row when the original ``f_name`` is a
    substring of the update's ``f_name``. For every match, the value in the
    second-to-last column of the original row is replaced by the value in the
    second-to-last column of the update row. When several updates match the
    same original, the one that comes last in *df_updt* wins.

    Args:
        df_updt: DataFrame of updated filings; needs an ``f_name`` column.
        df_orig: DataFrame of original filings; needs an ``f_name`` column.

    Returns:
        A new DataFrame shaped like *df_orig*. Neither input is modified.
    """
    df_newest = df_orig.copy()
    updt_names = list(df_updt.f_name)
    for i, orig_name in enumerate(df_orig.f_name):
        for j, updt_name in enumerate(updt_names):
            if orig_name in updt_name:
                # Positional access on purpose: -2 is the second-to-last
                # column in both frames, whatever it is called.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
# Replace original filings by their updated versions, then order by title.
df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)


def _company_short_name(f_name):
    """Return the company short name that prefixes a filing title.

    The short name is the text before the first colon (ASCII ':' or
    full-width U+FF1A). Taking a fixed four characters instead leaves a
    trailing colon on three-character names such as 士兰微, which then ends up
    in the CSV file name. Titles without a colon fall back to the first four
    characters.
    """
    head, sep, _ = f_name.replace('\uff1a', ':').partition(':')
    return head if sep else f_name[:4]


df_newest['公司简称'] = [_company_short_name(f) for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()

# One table per company, for the ten companies with the most filings.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

os.makedirs('10companies', exist_ok=True)
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)  # one CSV per company
ten_csv = os.listdir('10companies')
import re
import time
# Read the company short names from standard input: one line, names
# separated by whitespace. Example input:
# 上海贝岭 华天科技 华微电子 南大光电 台基股份 士兰微 康强电子 有研新材 欧比特 纳思达 苏州固锝 通富微电 长电科技
names = input().strip().split()
# Turn each short name into the path of its per-company CSV.
names = ['./10companies/%s.csv' % name for name in names]
df_list = [pd.read_csv(name) for name in names]

# Preview the first company's table.
df = df_list[0]
links = df['link']
f_names = df['f_name']
links[0]
# Notebook output:
# 'http://news.windin.com/ns/bulletin.php?code=4A9E148612E8&id=84370262&type=1'
def get_PDF_url(url):
    """Fetch an announcement page and return its PDF download link.

    The first ``<a href=...>`` tag of the page is taken as the download
    anchor.

    Args:
        url: Address of the announcement page.

    Returns:
        ``(href, fname)``: the absolute download URL and the stripped anchor
        text. An empty tuple ``()`` is returned, and a warning is emitted,
        when the page contains no matching ``<a>`` tag.
    """
    import warnings
    from urllib.parse import urljoin

    # A timeout keeps one unresponsive page from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    try:
        r.encoding = 'utf-8'
        html = r.text
        page_url = r.url
    finally:
        r.close()  # the HTML has been extracted; release the connection

    # The first <a> tag is the target, so a single search() is enough.
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)
    if a is None:
        # Building a Warning object without emitting it is a no-op, so the
        # message is sent through warnings.warn().
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()
    href = a.group(1).strip('"\'')  # tolerate quoted attribute values
    fname = a.group(2).strip()
    # Resolve the link against the page address instead of relying on a
    # fixed-length URL prefix.
    return (urljoin(page_url, href), fname)
# Quick check of get_PDF_url on a single announcement page.
url = 'http://news.windin.com/ns/bulletin.php?code=4A9E148612E8&id=84370262&type=1'
get_PDF_url(url)
# Notebook output:
# ('http://news.windin.com/ns/getatt.php?id=84370262&att_id=24460617&code=4A9E148612E8', '600171: 上海贝岭2016年年度报告.pdf')
# Download every report listed in the per-company tables into the current
# working directory.
# NOTE(review): each file is saved under the anchor text returned by
# get_PDF_url, while the later parsing step expects names of the form
# '<company>:<year>年年度报告.pdf'. Confirm that the two agree; f_names holds
# the titles from the table if those should be used instead.
for df in df_list:
    links = df['link']
    f_names = df['f_name']
    for link in links:
        found = get_PDF_url(link)
        if found:
            href, fname = found
            # Ask the server for the PDF itself, following redirects.
            r = requests.get(href, allow_redirects=True)
            try:
                # 'wb': r.content is the raw binary body of the response.
                with open('%s' % fname, 'wb') as pdf_file:
                    pdf_file.write(r.content)
            finally:
                r.close()
        # An empty result means the page had no download anchor; that link is
        # skipped instead of failing on tuple unpacking.
        time.sleep(10)  # pause between requests
import fitz
filenames = os.listdir()  # names of all entries in the working directory
finalfix = '年年度报告'
# Keep only the annual-report PDFs.
pdf_list = [f for f in filenames if f.endswith(finalfix + '.pdf')]
# The four characters right before the '年年度报告.pdf' suffix are the year.
years = [f[-13:-9] for f in pdf_list]
pdf_list
# Notebook output (abridged): a list of 69 file names such as
# ['有研新材:2019年年度报告.pdf', '有研新材:2018年年度报告.pdf', '华天科技:2017年年度报告.pdf', ...]

# Exploratory look at a single report.
p = pdf_list[0]
doc = fitz.open('华天科技:2015年年度报告.pdf')
toc = doc.get_toc()
# Index 2 of the eighth table-of-contents entry is used as a 1-based page
# number, hence the -1 when loading the page.
page_number = toc[7][2]
page7 = doc.load_page(page_number-1)
# get_text() is the current PyMuPDF name and the one extract_data uses; the
# camelCase alias getText() is deprecated.
text1 = page7.get_text()
def extract_data(pdf):
    """Extract company, year and revenue from one annual-report PDF.

    The file name must look like ``<company>:<year>...``: the text before the
    first colon is the company and the four characters after it are the year.
    The revenue is the first decimal number that follows 营业收入 inside the
    section whose heading ends with 会计数据和财务指标.

    Args:
        pdf: Path of the PDF file.

    Returns:
        ``(company_name, year, revenue)``, all strings; thousands separators
        are removed from the revenue.

    Raises:
        ValueError: If the file name has no colon, or the section heading,
            the revenue row or the revenue figure cannot be found.
    """
    # Accept the ASCII colon as well as the full-width colon (U+FF1A).
    colon_positions = [pos for pos in (pdf.find(':'), pdf.find('\uff1a'))
                       if pos != -1]
    if not colon_positions:
        raise ValueError('no colon in file name: %r' % pdf)
    idx = min(colon_positions)
    company_name = pdf[0:idx]
    year = pdf[idx+1:idx+5]

    doc = fitz.open(pdf)
    try:
        text = ''.join(page.get_text() for page in doc)
    finally:
        doc.close()  # release the file handle even if extraction fails

    # Locate the heading of the key-figures section.
    p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')
    section_match = p_s.search(text)
    if section_match is None:
        raise ValueError('section heading not found in %r' % pdf)
    s_idx = section_match.start()

    # Revenue row: everything from 营业收入 up to the following 归属于.
    p = re.compile('营业收入(.*?)归属于', re.DOTALL)
    row_match = p.search(text[s_idx:])
    if row_match is None:
        raise ValueError('revenue row not found in %r' % pdf)
    # Some reports wrap a number across lines, so line breaks are removed.
    data_line = row_match.group().replace('\n', '')

    # A number made of digits and commas that ends with one or two decimals.
    p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')
    digit_match = p_digit.search(data_line)
    if digit_match is None:
        raise ValueError('revenue figure not found in %r' % pdf)
    revenue = digit_match.group().replace(',', '')
    return (company_name, year, revenue)
# Example for a single file:
# company, year, revenue = extract_data(pdf_list[1])

# Collect one (company, year, revenue) record per report.
companies, years, revenues = [], [], []
for pdf in pdf_list:
    company, year, revenue = extract_data(pdf)
    companies.append(company)
    years.append(year)
    revenues.append(revenue)

df = pd.DataFrame({'company': companies,
                   'year': years,
                   'revenue': revenues})
df.sort_values('company', ignore_index=True, inplace=True)
df
# Notebook output (abridged; year and revenue are strings, and the years
# inside one company are not ordered):
#     company   year  revenue
# 0   上海贝岭  2019  878629217.06
# 1   上海贝岭  2018  784344437.44
# 2   上海贝岭  2016  509093878.16
# 3   上海贝岭  2017  561873977.23
# 4   上海贝岭  2020  1332205745.67
# ...
# 64  长电科技  2019  23526279785.46
# 65  长电科技  2017  23855512379.95
# 66  长电科技  2020  26463994512.61
# 67  长电科技  2016  19154527743.10
# 68  长电科技  2015  10807023798.60
# 69 rows x 3 columns
import numpy as np
from matplotlib import pyplot as plt
# One revenue-over-time chart per company, driven by the table itself.
#
# This replaces thirteen copy-pasted cells that sliced df by hard-coded row
# positions and re-typed the years and revenues by hand. The hand-typed years
# did not always match the data: the 长电科技 rows, for instance, cover 2015,
# 2016, 2017, 2019 and 2020 but were plotted against 2016-2020. The
# per-company scratch variables of those cells (df_shbl, y_shbl, r_shbl, ...)
# are no longer created.


def _company_revenue_series(frame, company):
    """Return ``(years, revenues)`` of *company*, ordered by year.

    *frame* needs the columns ``company``, ``year`` and ``revenue``; year and
    revenue may be stored as strings and are converted to ``int`` and
    ``float`` here.
    """
    rows = frame[frame['company'] == company]
    pairs = sorted(
        (int(year), float(revenue))
        for year, revenue in zip(rows['year'], rows['revenue'])
    )
    return [year for year, _ in pairs], [revenue for _, revenue in pairs]


def _plot_company_revenue(frame, company):
    """Show a line chart of the yearly revenue of *company*."""
    years_axis, revenue_axis = _company_revenue_series(frame, company)
    if not years_axis:
        return  # nothing to draw for an unknown company
    plt.title('revenue changing during %d-%d' % (years_axis[0], years_axis[-1]))
    plt.xlabel("year")
    plt.ylabel("revenue")
    plt.plot(years_axis, revenue_axis)
    plt.show()


# df is sorted by company, so the charts appear in the same order as before.
for company_name in df['company'].unique():
    _plot_company_revenue(df, company_name)