实验报告：通讯行业分析¶

第一步：读取通讯行业的Excel，并转为CSV格式¶

import pandas as pd
import openpyxl
import re

xlsx = 'E:/anaconda/通讯行业.xlsx'

df = pd.read_excel(xlsx)

exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [c.value for c in C]

links_1 = links[1:-1]
links_2 = ''.join(links_1)

sample  = '=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=239751740736&id=126293652&type=1","星网锐捷:2019年年度报告(更新后)")'

p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)

df2 = pd.DataFrame({'link':[t[0] for t in list_of_tuple], 'f_name':[t[1] for t in list_of_tuple]})

df2.to_csv('通讯行业.csv')

第二步：读取CSV文件，筛选符合条件的文件并下载¶

import requests
import re 
import pandas as pd
import os
import fitz
import time

df = pd.read_csv('E:/anaconda/通讯行业.csv', encoding='gb2312')
p = re.compile('(?<=\d{4}(年度))')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names

def filter_links(words,df,include=True):
    ls = []
    for word in words:
        if include:
            ls.append([word in f for f in df.f_name])
        else:
            ls.append([word not in f for f in df.f_name])
    index = []
    for r in range(len(df)):
        flag  = not include
        for c in range(len(words)):
            if include:
                ls.append([word not in f for f in df.f_name])
                index=[]
                for r in range(len(df)):
                    flag=not include
                    for c in range(len(words)):
                        if include:
                            flag = flag or ls[c][r]
                        else:
                            flag = flag and ls[c][r]
                    index.append(flag)
                df2=df[index]
                return(df2)
                
df_all = filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df,include=[False])
df_orig = filter_links(['(','('],df_all,include=[False])
df_updt = filter_links(['(','('],df_all,include=[True])
df_updt = filter_links(['取消'],df_updt,include=[False])

def sub_with_update(df_updt,df_orig):
    df_newest = df_orig.copy()
    index_orig=[]
    index_updt=[]
    for i,f in enumerate(df_orig.f_name):
        for j,fn in enumerate(df_updt.f_name):
            if f in fn:
                index_orig.append(i)
                index_updt.append(j)
    for n in range(len(index_orig)):
        i = index_orig[n]
        j = index_updt[n]
        df_orig.iloc[i,-2] = df_updt.iloc[j,-2]
    return(df_newest)

df_newest  = sub_with_update(df_updt,df_orig)
df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]

counts = df_newest['公司简称'].value_counts()

ten_company = []

for cn in counts.index[:10]:
    ten_company.append(filter_links([cn],df_newest))
    
if not os.path.exists('10companies'):
    os.makedirs('10companies')
    
for df_com in ten_company:
    cn=df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
    
ten_csv=os.listdir('10companies')
    
os.chdir('E:/anaconda/10companies')
f_1=os.listdir()

links = []
f_names = []
  
def get_PDF_url(url):
    r = requests.get(url);r.encoding = 'utf-8'; html = r.text
    r.close() 
    p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html) 
    if a is None:
        Warning('没有找到下载链接。请手动检查链接：%s' % url)
        return()
    else:
        href = a.group(1); fname = a.group(2).strip()
    href = r.url[:26] + href 
    return((href,fname))

hrefs=[];fnames=[]

for link in links:
    href,fname = get_PDF_url(link)
    hrefs.append(href)
    fnames.append(fname)
    df_final_links=pd.DataFrame({'href':hrefs,'fname':fnames})
    df_final_links.to_csv('E:/anaconda/10companies/通讯links.csv')
    
df_final_links=pd.read_csv('E:/anaconda/10companies/通讯links.csv')
f_names=df_final_links['fname']
hrefs=df_final_links['href']
for i in range(len(hrefs)):
    href=hrefs[i];f_name=f_names[i]
    r = requests.get(href, allow_redirects=True)
    open('%s' %f_name, 'wb').write(r.content)
    time.sleep(10)
r.close()

第三步：读取相关的数据内容，并画出图表分析，首先是十个企业近十年的营业总收入对比图¶

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
rcParams['font.family'] = 'simhei'

df = pd.read_excel("E:/anaconda/10companies/工作簿1.xlsx")
print(df)

plt.plot(df["年份"],df["st新海"],label='st新海',linewidth=1,color='r',markersize=12)
plt.plot(df["年份"],df["春兴精工"],label='春兴精工',linewidth=1,color='y',markersize=12)
plt.plot(df["年份"],df["光弘科技"],label='光弘科技',linewidth=1,color='blue',markersize=12)
plt.plot(df["年份"],df["汇源通信"],label='汇源通信',linewidth=1,color='purple',markersize=12)
plt.plot(df["年份"],df["科创新源"],label='科创新源',linewidth=1,color='green',markersize=12)
plt.plot(df["年份"],df["立昂技术"],label='立昂技术',linewidth=1,color='brown',markersize=12)
plt.plot(df["年份"],df["日海智能"],label='日海智能',linewidth=1,color='black',markersize=12)
plt.plot(df["年份"],df["润建股份"],label='润建股份',linewidth=1,color='pink',markersize=12)
plt.plot(df["年份"],df["星网锐捷"],label='星网锐捷',linewidth=1,color='grey',markersize=12)
plt.plot(df["年份"],df["亿联网络"],label='亿联网络',linewidth=1,color='violet',markersize=12)

plt.xlabel("年份")
plt.ylabel('营业总收入')
plt.title("营业收入走势图")
plt.legend()
plt.grid()
plt.show()

      年份     st新海     春兴精工     光弘科技    汇源通信    科创新源     立昂技术     日海智能  \
0   2021   180.06  2083.73  2210.39  327.94  452.34   831.06  3103.66   
1   2020   158.83  5153.25  2285.41  485.11  307.00  1006.43  3783.36   
2   2019   265.04  7261.65  2190.44  434.27  305.98  1381.97  4639.82   
3   2018   766.34  4932.83  1598.09  426.71  285.88   668.72  4420.09   
4   2017  1604.72  3804.48  1274.11  390.33  253.75   973.20  2989.79   
5   2016  1900.60  2535.86  1227.02  454.98  188.32   362.78  2706.79   
6   2015  1741.98  2117.75   917.53  434.95  154.22   277.61  2869.25   
7   2014  1142.27  2220.16   738.36  474.89  158.94   249.02  2428.74   
8   2013   823.43  1244.75      NaN  412.42     NaN   305.62  2079.22   
9   2012   817.02   896.14      NaN  438.75     NaN      NaN  1925.10   
10  2011   785.07   827.22      NaN  480.70     NaN      NaN  1333.11   

       润建股份      星网锐捷     亿联网络  
0   4580.79   9191.19  2528.58  
1   4192.63  10304.23  2754.29  
2   3717.02   9265.77  2489.35  
3   3231.68   9131.57  1815.36  
4   2778.39   7705.14  1387.76  
5   2293.12   5687.66   923.74  
6   1513.45   4516.51   661.91  
7       NaN   3641.69   488.36  
8       NaN   3276.16      NaN  
9       NaN   2787.37      NaN  
10      NaN   2644.27      NaN

第四步：分析这十个公司近五年，每年的营业成本，并作图对比¶

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
rcParams['font.family'] = 'simhei'

data = pd.read_excel('E:/anaconda/10companies/工作簿2.xlsx').set_index(['date'])

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['st新海','春兴精工','光弘科技','汇源通信','立昂技术','科创新源', '日海智能', '润建股份', '星网锐捷', '亿联网络']
data0=data.iloc[0]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2017年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['st新海','春兴精工','光弘科技','汇源通信','立昂技术','科创新源', '日海智能', '润建股份', '星网锐捷', '亿联网络']
data0=data.iloc[1]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2018年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['st新海','春兴精工','光弘科技','汇源通信','立昂技术','科创新源', '日海智能', '润建股份', '星网锐捷', '亿联网络']
data0=data.iloc[2]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2019年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['st新海','春兴精工','光弘科技','汇源通信','立昂技术','科创新源', '日海智能', '润建股份', '星网锐捷', '亿联网络']
data0=data.iloc[3]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2020年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['st新海','春兴精工','光弘科技','汇源通信','立昂技术','科创新源', '日海智能', '润建股份', '星网锐捷', '亿联网络']
data0=data.iloc[4]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2021年营业成本对比（单位：万元)')
plt.show()