import requests
import pandas as pd
import openpyxl
import os


# Read the announcement index. The context manager closes the file handle as
# soon as pandas has parsed it (the original left it open).
# A '../' prefix in the path would point at a CSV in the parent directory.
with open('半导体行业.csv', encoding='utf-8') as f:
    df = pd.read_csv(f)

def filter_links(words,df,include=True):
    """Filter the rows of ``df`` by substrings of its ``f_name`` column.

    Args:
        words: Substrings to look for in each ``f_name`` value.
        df: DataFrame with an ``f_name`` column of strings.
        include: If True, keep rows whose name contains at least one of
            ``words``; if False, keep rows whose name contains none of them.

    Returns:
        A DataFrame holding the selected rows with their original index
        labels and all columns of ``df``.
    """
    if include:
        keep = [any(word in name for word in words) for name in df.f_name]
    else:
        keep = [all(word not in name for word in words) for name in df.f_name]
    # Select by position: for an empty frame the mask is ``[]`` and ``df[[]]``
    # would be read as an empty *column* selection, dropping every column.
    rows = [pos for pos, flag in enumerate(keep) if flag]
    return df.iloc[rows]


# Drop every file whose name contains one of these words.
excluded_words = ['摘要', '问询函', '社会责任']
df_all = filter_links(excluded_words, df, include=False)

# Names containing a full-width or half-width opening bracket.
brackets = ['（', '(']
df_original = filter_links(brackets, df_all)

# Names without a bracket go to df_orig; bracketed names that do not contain
# '取消' go to df_updt.
df_orig = filter_links(brackets, df_all, include=False)
df_updt = filter_links(['取消'],
                       filter_links(brackets, df_all, include=True),
                       include=False)


def sub_with_update(df_updt,df_orig):
    """Return a copy of ``df_orig`` with values taken from matching updates.

    A row of ``df_updt`` matches a row of ``df_orig`` when the original
    ``f_name`` is a substring of the updated ``f_name``. For every match the
    second-to-last column of the original row is overwritten with the value
    from the updated row; when several updates match, the last one wins.

    Args:
        df_updt: DataFrame of updated entries (needs an ``f_name`` column).
        df_orig: DataFrame of original entries (needs an ``f_name`` column).

    Returns:
        A new DataFrame; ``df_orig`` itself is not modified.
    """
    df_newest = df_orig.copy()
    updated_names = list(df_updt.f_name)
    # Collect (original row, updated row) position pairs in scan order.
    matches = [
        (row, upd_row)
        for row, name in enumerate(df_orig.f_name)
        for upd_row, upd_name in enumerate(updated_names)
        if name in upd_name
    ]
    for row, upd_row in matches:
        df_newest.iloc[row, -2] = df_updt.iloc[upd_row, -2]
    return df_newest

# Overwrite the entries of original reports with those of their updates.
df_newest = sub_with_update(df_updt, df_orig)

df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)
# The first four characters of the file name serve as the company short name.
# NOTE(review): assumes every short name is exactly four characters long -
# confirm for three-character names such as 士兰微.
df_newest['公司简称'] = [name[:4] for name in df_newest.f_name]

# Number of reports per short name, most frequent first.
counts = df_newest['公司简称'].value_counts()

# One DataFrame per company for the ten most frequent short names.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

if not os.path.exists('10companies'):
    os.makedirs('10companies')

# Write each company's table to its own CSV file.
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)

ten_csv = os.listdir('10companies')


import re
import time


# Example input: 上海贝岭 华天科技 华微电子 南大光电 台基股份 士兰微 康强电子 有研新材 欧比特 纳思达 苏州固锝 通富微电 长电科技
# Company names are typed on one line, separated by whitespace.
company_names = input().strip().split()
names = ['./10companies/%s.csv' % company for company in company_names]
df_list = [pd.read_csv(csv_path) for csv_path in names]

上海贝岭 华天科技 华微电子 南大光电 台基股份 士兰微 康强电子 有研新材 欧比特 纳思达 苏州固锝 通富微电 长电科技


# Preview: the first company's table and its first announcement link.
df = df_list[0]
links = df['link']
f_names = df['f_name']
links[0]

'http://news.windin.com/ns/bulletin.php?code=4A9E148612E8&id=84370262&type=1'


def get_PDF_url(url):
    """Fetch an announcement page and return its first attachment link.

    Args:
        url: Address of the announcement page.

    Returns:
        A ``(href, fname)`` tuple: the absolute download URL and the stripped
        link text of the first ``<a>`` tag on the page. When the page has no
        matching ``<a>`` tag a warning is emitted and an empty tuple is
        returned.
    """
    import warnings
    from urllib.parse import urljoin

    # The connection is released as soon as the page text has been read.
    with requests.get(url, timeout=30) as r:
        r.encoding = 'utf-8'
        html = r.text
        page_url = r.url
    # The first <a> tag on the page is the target, so search() is enough.
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)
    if a is None:
        # warnings.warn actually emits the message; a bare ``Warning(...)``
        # only builds an exception object and throws it away.
        warnings.warn('没有找到下载链接。请手动检查链接：%s' % url)
        return ()
    href = a.group(1).strip('\'"')
    fname = a.group(2).strip()
    # Resolve the href against the page URL instead of relying on a fixed
    # 26-character prefix of it.
    return (urljoin(page_url, href), fname)

# Example: resolve the download link of a single announcement page.
url = 'http://news.windin.com/ns/bulletin.php?code=4A9E148612E8&id=84370262&type=1'
get_PDF_url(url)

('http://news.windin.com/ns/getatt.php?id=84370262&att_id=24460617&code=4A9E148612E8',
 '600171： 上海贝岭2016年年度报告.pdf')


# Download the attachment behind every announcement link of every company.
for df in df_list:
    links = df['link']
    f_names = df['f_name']
    for link in links:
        result = get_PDF_url(link)
        if not result:
            # No download link was found on this page; skip it instead of
            # failing on the tuple unpacking below.
            continue
        href, fname = result
        # Each response is closed right after its download, not only the
        # last one after the loop.
        with requests.get(href, allow_redirects=True) as r:
            # Save under the bare file name so that a link text containing
            # path separators cannot write outside the working directory.
            # 'wb' writes the raw bytes of the response body.
            with open(os.path.basename(fname), 'wb') as pdf_file:
                pdf_file.write(r.content)
        time.sleep(10)  # pause between two downloads


import fitz


# Every entry of the current working directory.
filenames = os.listdir()
finalfix = '年年度报告'

# Keep only the annual-report PDFs; the year is the four characters that
# precede the '年年度报告.pdf' suffix.
pdf_list = list(filter(lambda entry: entry.endswith('年年度报告.pdf'), filenames))
years = [entry[-13:-9] for entry in pdf_list]
pdf_list

['有研新材：2019年年度报告.pdf',
 '有研新材：2018年年度报告.pdf',
 '华天科技：2017年年度报告.pdf',
 '华天科技：2016年年度报告.pdf',
 '康强电子：2019年年度报告.pdf',
 '康强电子：2018年年度报告.pdf',
 '欧比特：2020年年度报告.pdf',
 '欧比特：2015年年度报告.pdf',
 '华微电子：2017年年度报告.pdf',
 '华微电子：2016年年度报告.pdf',
 '苏州固锝：2017年年度报告.pdf',
 '苏州固锝：2016年年度报告.pdf',
 '通富微电：2017年年度报告.pdf',
 '通富微电：2016年年度报告.pdf',
 '南大光电：2019年年度报告.pdf',
 '南大光电：2018年年度报告.pdf',
 '长电科技：2017年年度报告.pdf',
 '长电科技：2016年年度报告.pdf',
 '纳思达：2020年年度报告.pdf',
 '南大光电：2020年年度报告.pdf',
 '士兰微：2016年年度报告.pdf',
 '士兰微：2017年年度报告.pdf',
 '南大光电：2015年年度报告.pdf',
 '台基股份：2017年年度报告.pdf',
 '台基股份：2016年年度报告.pdf',
 '纳思达：2019年年度报告.pdf',
 '纳思达：2018年年度报告.pdf',
 '欧比特：2019年年度报告.pdf',
 '康强电子：2020年年度报告.pdf',
 '有研新材：2020年年度报告.pdf',
 '上海贝岭：2017年年度报告.pdf',
 '上海贝岭：2016年年度报告.pdf',
 '士兰微：2018年年度报告.pdf',
 '士兰微：2019年年度报告.pdf',
 '长电科技：2015年年度报告.pdf',
 '台基股份：2019年年度报告.pdf',
 '纳思达：2017年年度报告.pdf',
 '长电科技：2020年年度报告.pdf',
 '欧比特：2017年年度报告.pdf',
 '欧比特：2016年年度报告.pdf',
 '华微电子：2015年年度报告.pdf',
 '华天科技：2020年年度报告.pdf',
 '苏州固锝：2015年年度报告.pdf',
 '通富微电：2015年年度报告.pdf',
 '通富微电：2020年年度报告.pdf',
 '上海贝岭：2019年年度报告.pdf',
 '苏州固锝：2020年年度报告.pdf',
 '上海贝岭：2018年年度报告.pdf',
 '华天科技：2015年年度报告.pdf',
 '华微电子：2020年年度报告.pdf',
 '有研新材：2017年年度报告.pdf',
 '有研新材：2016年年度报告.pdf',
 '康强电子：2017年年度报告.pdf',
 '康强电子：2016年年度报告.pdf',
 '华天科技：2019年年度报告.pdf',
 '华天科技：2018年年度报告.pdf',
 '华微电子：2019年年度报告.pdf',
 '通富微电：2019年年度报告.pdf',
 '通富微电：2018年年度报告.pdf',
 '上海贝岭：2020年年度报告.pdf',
 '苏州固锝：2019年年度报告.pdf',
 '苏州固锝：2018年年度报告.pdf',
 '士兰微：2020年年度报告.pdf',
 '南大光电：2017年年度报告.pdf',
 '南大光电：2016年年度报告.pdf',
 '台基股份：2015年年度报告.pdf',
 '长电科技：2019年年度报告.pdf',
 '台基股份：2020年年度报告.pdf',
 '士兰微：2015年年度报告.pdf']


def extract_data(pdf):
    """Extract company, year and operating revenue from an annual-report PDF.

    Args:
        pdf: Path of the PDF. The text before the first '：' is taken as the
            company name and the four characters after it as the year.

    Returns:
        A ``(company_name, year, revenue)`` tuple of strings; ``revenue`` is
        the first number found after '营业收入' with thousands separators
        removed.

    Raises:
        ValueError: If the section heading, the revenue row, or a number in
            that row cannot be found in the document text.
    """
    idx = pdf.find('：')
    company_name = pdf[0:idx]
    year = pdf[idx+1:idx+5]
    # Read the text of all pages; the document is closed afterwards.
    with fitz.open(pdf) as doc:
        text = ''.join(page.get_text() for page in doc)
    # Locate the heading of the "会计数据和财务指标" section.
    p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')
    section_match = p_s.search(text)
    if section_match is None:
        raise ValueError('section heading not found in %s' % pdf)
    s_idx = section_match.start()
    # Take the span from '营业收入' up to the next '归属于' after the heading.
    p = re.compile('营业收入(.*?)归属于',re.DOTALL)
    line_match = p.search(text[s_idx:])
    if line_match is None:
        raise ValueError('revenue row not found in %s' % pdf)
    # Some reports break a number across lines, so line breaks are removed.
    data_line = line_match.group().replace('\n', '')
    # A number: optional minus sign, digits and commas, then a decimal point
    # followed by one or two digits.
    p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')
    digit_match = p_digit.search(data_line)
    if digit_match is None:
        raise ValueError('no revenue figure found in %s' % pdf)
    revenue = digit_match.group().replace(',','')
    return (company_name, year, revenue)


# Example: company, year, revenue = extract_data(pdf_list[1])

# Run the extraction over every report, then split the tuples into columns.
records = [extract_data(pdf) for pdf in pdf_list]
companies = [record[0] for record in records]
years = [record[1] for record in records]
revenues = [record[2] for record in records]


df = pd.DataFrame({'company': companies,
                   'year': years,
                   'revenue': revenues})


# Bring the rows of each company together.
df = df.sort_values('company', ignore_index=True)
df


import numpy as np 
from matplotlib import pyplot as plt


# 上海贝岭: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_shbl = df[df['company'] == '上海贝岭']
df_shbl


# Order the rows by year and take the year column.
df2_shbl = df_shbl.sort_values('year',ignore_index=True)
y_shbl = df2_shbl['year']
y_shbl

0    2016
1    2017
2    2018
3    2019
4    2020
Name: year, dtype: object


# Revenue column of the year-sorted 上海贝岭 table.
r_shbl = df2_shbl['revenue']
r_shbl

0     509093878.16
1     561873977.23
2     784344437.44
3     878629217.06
4    1332205745.67
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_shbl]
y = [float(value) for value in r_shbl]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 华天科技: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_htkj = df[df['company'] == '华天科技']
df2_htkj = df_htkj.sort_values('year',ignore_index=True)
y_htkj = df2_htkj['year']
r_htkj = df2_htkj['revenue']
r_htkj

0    3874017127.37
1    5475027849.36
2    7009887112.79
3    7121706261.65
4    8103490628.12
5    8382084225.00
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_htkj]
y = [float(value) for value in r_htkj]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 华微电子: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_hwdz = df[df['company'] == '华微电子']
df2_hwdz = df_hwdz.sort_values('year',ignore_index=True)
y_hwdz = df2_hwdz['year']
r_hwdz = df2_hwdz['revenue']
r_hwdz

0    1300659652.87
1    1395863516.46
2    1634890299.33
3    1656485627.44
4    1718583578.88
Name: revenue, dtype: object


# Take the years from the data: the file list has no 2018 report for this
# company (it has 2015-2017, 2019 and 2020), so a typed-in consecutive
# 2016-2020 axis would put the points on the wrong years.
x = [int(year) for year in y_hwdz]
y = [float(value) for value in r_hwdz]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 南大光电: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_ndgd = df[df['company'] == '南大光电']
df2_ndgd = df_ndgd.sort_values('year',ignore_index=True)
y_ndgd = df2_ndgd['year']
r_ndgd = df2_ndgd['revenue']
r_ndgd

0    120372270.01
1    101325460.75
2    177213496.48
3    228174901.45
4    321375774.07
5    594958532.20
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_ndgd]
y = [float(value) for value in r_ndgd]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 台基股份: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_tjgf = df[df['company'] == '台基股份']
df2_tjgf = df_tjgf.sort_values('year',ignore_index=True)
y_tjgf = df2_tjgf['year']
r_tjgf = df2_tjgf['revenue']
r_tjgf

0    166154459.67
1    242091866.05
2    278651806.10
3    264937838.14
4    388244929.38
Name: revenue, dtype: object


# Take the years from the data: the file list has no 2018 report for this
# company (it has 2015-2017, 2019 and 2020), so a typed-in consecutive
# 2016-2020 axis would put the points on the wrong years.
x = [int(year) for year in y_tjgf]
y = [float(value) for value in r_tjgf]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 士兰微: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_slw = df[df['company'] == '士兰微']
df2_slw = df_slw.sort_values('year',ignore_index=True)
y_slw = df2_slw['year']
r_slw = df2_slw['revenue']
r_slw

0    1926414794.50
1    2375053756.57
2    2741791759.44
3    3025857115.44
4    3110573827.93
5    4280561779.48
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_slw]
y = [float(value) for value in r_slw]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 康强电子: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_kqdz = df[df['company'] == '康强电子']
df2_kqdz = df_kqdz.sort_values('year',ignore_index=True)
y_kqdz = df2_kqdz['year']
r_kqdz = df2_kqdz['revenue']
r_kqdz

0    1196754513.55
1    1303618061.94
2    1482897034.30
3    1418269645.18
4    1548632508.79
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_kqdz]
y = [float(value) for value in r_kqdz]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 有研新材: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_yyxc = df[df['company'] == '有研新材']
df2_yyxc = df_yyxc.sort_values('year',ignore_index=True)
y_yyxc = df2_yyxc['year']
r_yyxc = df2_yyxc['revenue']
r_yyxc

0     3807978100.03
1     4079620628.94
2     4767907571.39
3    10452454056.87
4    12969038953.96
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_yyxc]
y = [float(value) for value in r_yyxc]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 欧比特: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_obt = df[df['company'] == '欧比特']
df2_obt = df_obt.sort_values('year',ignore_index=True)
y_obt = df2_obt['year']
r_obt = df2_obt['revenue']
r_obt

0    388817482.85
1    559936657.37
2    738851449.89
3    851703526.46
4    869831525.64
Name: revenue, dtype: object


# Take the years from the data: the file list has no 2018 report for this
# company (it has 2015-2017, 2019 and 2020), so a typed-in consecutive
# 2016-2020 axis would put the points on the wrong years.
x = [int(year) for year in y_obt]
y = [float(value) for value in r_obt]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 纳思达: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_nsd = df[df['company'] == '纳思达']
df2_nsd = df_nsd.sort_values('year',ignore_index=True)
y_nsd = df2_nsd['year']
r_nsd = df2_nsd['revenue']
r_nsd

0    21323938529.08
1    21926472338.76
2    23295845261.03
3    19585185042.24
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_nsd]
y = [float(value) for value in r_nsd]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 苏州固锝: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_szgd = df[df['company'] == '苏州固锝']
df2_szgd = df_szgd.sort_values('year',ignore_index=True)
y_szgd = df2_szgd['year']
r_szgd = df2_szgd['revenue']
r_szgd

0     811945945.86
1    1187334429.10
2    1854591461.11
3    1885325487.13
4    1980553309.06
5    1804661200.03
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_szgd]
y = [float(value) for value in r_szgd]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 通富微电: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_tfwd = df[df['company'] == '通富微电']
df2_tfwd = df_tfwd.sort_values('year',ignore_index=True)
y_tfwd = df2_tfwd['year']
r_tfwd = df2_tfwd['revenue']
r_tfwd

0     2321903112.69
1     4591656651.56
2     6519255165.45
3     7222862993.75
4     8266574620.47
5    10768700029.40
Name: revenue, dtype: object


# Build the axes from the extracted series rather than from numbers copied by
# hand out of the printed output.
x = [int(year) for year in y_tfwd]
y = [float(value) for value in r_tfwd]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()


# 长电科技: select the rows by company name instead of fixed row positions, so
# the selection stays correct if the set of extracted reports changes.
df_cdkj = df[df['company'] == '长电科技']
df2_cdkj = df_cdkj.sort_values('year',ignore_index=True)
y_cdkj = df2_cdkj['year']
r_cdkj = df2_cdkj['revenue']
r_cdkj

0    10807023798.60
1    19154527743.10
2    23855512379.95
3    23526279785.46
4    26463994512.61
Name: revenue, dtype: object


# Take the years from the data: the extracted rows for this company are 2015,
# 2016, 2017, 2019 and 2020 (there is no 2018 report), so a typed-in
# consecutive 2016-2020 axis would put the points on the wrong years.
x = [int(year) for year in y_cdkj]
y = [float(value) for value in r_cdkj]
plt.title('revenue changing during %d-%d' % (min(x), max(x)))
plt.xlabel("year")
plt.ylabel("revenue")
plt.plot(x,y)
plt.show()

生成年报数目最多的10家公司¶

import包¶

标准化年报文件名¶

计算公司年报数目¶

创建并生成这10家公司年报的csv¶

提取csv中的年报链接¶

导入包¶

读取csv生成链接¶

一个示例¶

请求下载pdf内容¶

读取年报中的营业收入内容¶

提取样例¶

定义提取营业收入的函数¶

设置循环，运用函数读取所有年报内容¶

以dataframe的形式输出提取内容¶

将df的内容按照公司名称归类排序¶

绘制图像¶

导入库¶

绘制图像¶

上海贝岭营业收入图像¶

提取上海贝岭内容并按照年份排序¶

上海贝岭图像¶

华天科技图像¶

华微电子¶

南大光电¶

台基股份¶

士兰微¶

康强电子¶

有研新材¶

欧比特¶

纳思达¶

苏州固锝¶

通富微电¶

长电科技¶

	company	year	revenue
0	上海贝岭	2019	878629217.06
1	上海贝岭	2018	784344437.44
2	上海贝岭	2016	509093878.16
3	上海贝岭	2017	561873977.23
4	上海贝岭	2020	1332205745.67
...	...	...	...
64	长电科技	2019	23526279785.46
65	长电科技	2017	23855512379.95
66	长电科技	2020	26463994512.61
67	长电科技	2016	19154527743.10
68	长电科技	2015	10807023798.60