import pandas as pd
import openpyxl
import re
# Turn column C of the workbook into a two-column table (link, f_name) and
# save it as CSV.
xlsx = '证券行业.xlsx'
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active  # the worksheet that was active when the file was saved
C = sheet['C']
links = [c.value for c in C]
# The first and last cells of the column are skipped, exactly as before
# (presumably a header and a trailing footer row - confirm against the file).
# Cells that hold no text are ignored so that the join below cannot fail.
links_1 = [value for value in links[1:-1] if isinstance(value, str)]
links_2 = ''.join(links_1)
# Every cell is expected to contain a pair of quoted strings: "url","title".
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({'link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})
print(df2)
df2.to_csv('证券行业.csv')

                                                  link            f_name
0    http://news.windin.com/ns/bulletin.php?code=2B...    方正证券:2020年年度报告
1    http://news.windin.com/ns/bulletin.php?code=D9...  方正证券:2020年年度报告摘要
2    http://news.windin.com/ns/bulletin.php?code=8D...    长江证券:2020年年度报告
3    http://news.windin.com/ns/bulletin.php?code=8C...  长江证券:2020年年度报告摘要
4    http://news.windin.com/ns/bulletin.php?code=B6...    西南证券:2020年年度报告
..                                                 ...               ...
495  http://news.windin.com/ns/bulletin.php?code=85...    国金证券:2014年年报摘要
496  http://news.windin.com/ns/bulletin.php?code=1f...      兴业证券:2014年年报
497  http://news.windin.com/ns/bulletin.php?code=63...    兴业证券:2014年年报摘要
498  http://news.windin.com/ns/bulletin.php?code=f5...      华泰证券:2014年年报
499  http://news.windin.com/ns/bulletin.php?code=68...    华泰证券:2014年年报摘要

[500 rows x 2 columns]


import re
import pandas as pd
import os
df = pd.read_csv("证券行业.csv", engine="python", encoding="utf-8")
# Rewrite the short forms "年报" (directly after a four-digit year) and
# "年年报" to the long form "年年度报告" so that every title uses one wording.
# Alternation binds loosest, so the look-behind guards the first branch only.
# A raw string keeps "\d" from being treated as an invalid string escape.
p = re.compile(r"(?<=\d{4})(年报)|(年年报)")
df["f_name"] = [p.sub("年年度报告", f) for f in df.f_name]
del p
print(df)

     Unnamed: 0                                               link  \
0             0  http://news.windin.com/ns/bulletin.php?code=2B...   
1             1  http://news.windin.com/ns/bulletin.php?code=D9...   
2             2  http://news.windin.com/ns/bulletin.php?code=8D...   
3             3  http://news.windin.com/ns/bulletin.php?code=8C...   
4             4  http://news.windin.com/ns/bulletin.php?code=B6...   
..          ...                                                ...   
495         495  http://news.windin.com/ns/bulletin.php?code=85...   
496         496  http://news.windin.com/ns/bulletin.php?code=1f...   
497         497  http://news.windin.com/ns/bulletin.php?code=63...   
498         498  http://news.windin.com/ns/bulletin.php?code=f5...   
499         499  http://news.windin.com/ns/bulletin.php?code=68...   

               f_name  
0      方正证券:2020年年度报告  
1    方正证券:2020年年度报告摘要  
2      长江证券:2020年年度报告  
3    长江证券:2020年年度报告摘要  
4      西南证券:2020年年度报告  
..                ...  
495  国金证券:2014年年度报告摘要  
496    兴业证券:2014年年度报告  
497  兴业证券:2014年年度报告摘要  
498    华泰证券:2014年年度报告  
499  华泰证券:2014年年度报告摘要  

[500 rows x 3 columns]


def filter_links(words,df,include=True):
    """Select rows of ``df`` by substring-matching ``words`` against ``f_name``.

    words   -- iterable of keyword strings
    df      -- DataFrame with an ``f_name`` column of strings
    include -- True: keep rows whose name contains at least one keyword;
               False: keep rows whose name contains none of the keywords

    Returns the selected rows as a new DataFrame (original index preserved).
    """
    names = list(df.f_name)
    if include:
        keep = [any(word in name for word in words) for name in names]
    else:
        keep = [all(word not in name for word in words) for name in names]
    return df[keep]
# Step 1: discard every title that contains one of the unwanted keywords.
df_all = filter_links(
    ["摘要", "问询函", "社会责任", "审计", "财务", "风险", "债券"],
    df,
    include=False,
)
# Step 2: split by whether the title contains a bracket (full- or half-width).
# Bracketed titles are presumably revised editions - confirm against the data.
df_orig = filter_links(["（", "("], df_all, include=False)
df_updt = filter_links(["（", "("], df_all, include=True)
# Step 3: among the bracketed titles, drop the ones containing "取消".
df_updt = filter_links(["取消"], df_updt, include=False)
print(df_orig)

     Unnamed: 0                                               link  \
0             0  http://news.windin.com/ns/bulletin.php?code=2B...   
2             2  http://news.windin.com/ns/bulletin.php?code=8D...   
4             4  http://news.windin.com/ns/bulletin.php?code=B6...   
6             6  http://news.windin.com/ns/bulletin.php?code=6B...   
8             8  http://news.windin.com/ns/bulletin.php?code=F8...   
..          ...                                                ...   
490         490  http://news.windin.com/ns/bulletin.php?code=71...   
492         492  http://news.windin.com/ns/bulletin.php?code=44...   
494         494  http://news.windin.com/ns/bulletin.php?code=3a...   
496         496  http://news.windin.com/ns/bulletin.php?code=1f...   
498         498  http://news.windin.com/ns/bulletin.php?code=f5...   

             f_name  
0    方正证券:2020年年度报告  
2    长江证券:2020年年度报告  
4    西南证券:2020年年度报告  
6    东吴证券:2020年年度报告  
8     太平洋:2020年年度报告  
..              ...  
490  东北证券:2014年年度报告  
492  国元证券:2014年年度报告  
494  国金证券:2014年年度报告  
496  兴业证券:2014年年度报告  
498  华泰证券:2014年年度报告  

[243 rows x 3 columns]


def sub_with_update(df_updt,df_orig):
    """Return a copy of ``df_orig`` whose links point at updated reports.

    For every row of ``df_orig`` whose ``f_name`` occurs as a substring of an
    ``f_name`` in ``df_updt``, the second-to-last column of the copy is
    overwritten with the second-to-last column of that ``df_updt`` row.
    When several rows of ``df_updt`` match, the last one wins.
    ``df_orig`` itself is not modified.
    """
    df_newest = df_orig.copy()
    updated_names = list(df_updt.f_name)
    for row, orig_name in enumerate(df_orig.f_name):
        for pos, updated_name in enumerate(updated_names):
            if orig_name in updated_name:
                # positional column -2, as in the original implementation
                df_newest.iloc[row, -2] = df_updt.iloc[pos, -2]
    return df_newest
def _company_short_name(title):
    """Return the company name that precedes the colon in a report title.

    Company names are not always four characters long (e.g. "太平洋"), so a
    fixed ``title[:4]`` slice would keep the colon. Titles without any colon
    fall back to the first four characters, as the notebook did originally.
    """
    head, sep, _ = title.replace("：", ":").partition(":")
    return head if sep else title[:4]


df_newest = sub_with_update(df_updt,df_orig)
# Sort by title so that each company's reports sit together, ordered by year.
df_newest.sort_values(by = ["f_name"],inplace=True,ignore_index=True)
df_newest["公司简称"] = [_company_short_name(f) for f in df_newest.f_name]
print(df_newest)

     Unnamed: 0                                               link  \
0           449  http://news.windin.com/ns/bulletin.php?code=32...   
1           353  http://news.windin.com/ns/bulletin.php?code=50...   
2           275  http://news.windin.com/ns/bulletin.php?code=2E...   
3           113  http://news.windin.com/ns/bulletin.php?code=83...   
4            26  http://news.windin.com/ns/bulletin.php?code=8D...   
..          ...                                                ...   
238         439  http://news.windin.com/ns/bulletin.php?code=cd...   
239         398  http://news.windin.com/ns/bulletin.php?code=A1...   
240         276  http://news.windin.com/ns/bulletin.php?code=72...   
241         115  http://news.windin.com/ns/bulletin.php?code=9B...   
242           2  http://news.windin.com/ns/bulletin.php?code=8D...   

             f_name  公司简称  
0    东兴证券:2015年年度报告  东兴证券  
1    东兴证券:2016年年度报告  东兴证券  
2    东兴证券:2017年年度报告  东兴证券  
3    东兴证券:2019年年度报告  东兴证券  
4    东兴证券:2020年年度报告  东兴证券  
..              ...   ...  
238  长江证券:2015年年度报告  长江证券  
239  长江证券:2016年年度报告  长江证券  
240  长江证券:2017年年度报告  长江证券  
241  长江证券:2019年年度报告  长江证券  
242  长江证券:2020年年度报告  长江证券  

[243 rows x 4 columns]


# value_counts() orders companies by number of reports, most frequent first,
# so the first ten index labels are the ten companies with the most reports.
counts = df_newest["公司简称"].value_counts()
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]
if not os.path.exists("10companies"):
    os.makedirs("10companies")
# One CSV per company, named after the company short name.
for df_com in ten_company:
    cn = df_com["公司简称"].iloc[0]
    df_com.to_csv("10companies/%s.csv" % cn)
ten_csv = os.listdir("10companies")


import re
import os
import requests
import pandas as pd
import time
def get_PDF_url(url):
    """Fetch an announcement page and return ``(href, fname)`` of its first link.

    url -- address of the announcement page

    Returns a tuple of the download address and the link text, or an empty
    tuple (after issuing a warning) when the page contains no ``<a>`` tag.
    """
    import warnings  # local import: only needed on the failure path

    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    r.close()  # the body has been read; release the connection
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)  # only the first <a> tag on the page is used
    if a is None:
        warnings.warn('没有找到下载链接。请手动检查链接：%s' % url)
        return ()
    # The first 26 characters of the final URL are used as the prefix of the
    # download address (for the links in this data set that is presumably
    # "http://news.windin.com/ns/" - confirm if the source site changes).
    href = r.url[:26] + a.group(1)
    fname = a.group(2).strip()
    return (href, fname)


# Resolve the download address of every report, one output CSV per company.
domain = os.path.abspath(r'10companies')
for info in os.listdir(domain):
    info = os.path.join(domain, info)  # full path of this company's CSV
    df = pd.read_csv(info)
    links = df["link"]
    hrefs = []; fnames = []
    for link in links:
        found = get_PDF_url(link)
        if not found:
            continue  # already warned; skip pages without a download link
        href, fname = found
        hrefs.append(href)
        fnames.append(fname)
        time.sleep(0)  # no pause between requests
    df_final_links = pd.DataFrame({'href': hrefs,
                                   'f_name': fnames})
    # Company name = file name without its extension (works for any length).
    ste = os.path.splitext(os.path.basename(info))[0]
    df_final_links.to_csv("final_links_"+ste+".csv")


import os
import requests
import pandas as pd
import time
# Download every report listed in the per-company "final_links_<name>.csv".
domain = os.path.abspath(r'10companies')
for info in os.listdir(domain):
    # Company name = file name without its extension (works for any length).
    ste = os.path.splitext(os.path.basename(info))[0]
    df_final_links = pd.read_csv("final_links_"+ste+".csv")
    for href, f_name in zip(df_final_links["href"], df_final_links["f_name"]):
        r = requests.get(href,allow_redirects=True)
        try:
            # The link text is used as the local file name (current directory).
            with open('%s'%f_name,'wb') as pdf_file:
                pdf_file.write(r.content)
        finally:
            r.close()  # close every response, not only the last one
        time.sleep(0)  # no pause between downloads


import fitz # pip install pymupdf
import re
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
def getText(pdf):
    """Return the text of all pages of the PDF at path ``pdf``.

    Every space is followed by an inserted newline and doubled newlines are
    collapsed, because some numbers are not followed by a line break in the
    raw text and the non-greedy per-line matching in ``parse_data_line``
    would otherwise fail.
    """
    doc = fitz.open(pdf)
    try:
        # Page.get_text() replaces the removed Page.getText().
        text = ''.join(page.get_text() for page in doc)
    finally:
        doc.close()
    text = text.replace(" "," \n")
    text = text.replace("\n\n","\n")
    return text


def get_content(pdf):
    """Return the report section running from the "...主要...数据" heading
    to the first occurrence of "经营活动产生的".

    Raises ValueError when the section cannot be located (for example when
    the PDF holds scanned images instead of text).
    """
    text = getText(pdf)
    p = re.compile(r'(?<=\n)\D、\s*\D*?主要\D*?数据\D*?\s*(?=\n)(.*?)经营活动产生的',re.DOTALL)
    found = p.search(text)
    if found is None:
        raise ValueError("key accounting data section not found in %s" % pdf)
    return found.group(0)


def parse_data_line(pdf):
    """Return the text block that starts at the "营业" label line and is
    followed by four numeric lines, taken from the section of ``pdf``
    returned by ``get_content``.

    Raises ValueError when no such block exists.
    """
    content = get_content(pdf)
    subp = r"([0-9,.%\- ]*?)\n"
    psub = "%s%s%s%s" % (subp,subp,subp,subp)
    p = re.compile(r"(?<=\n)营业(\D*?\n)+%s" % psub)
    # Search this PDF's own section (not a global left over from another cell).
    lines = p.search(content)
    if lines is None:
        raise ValueError("operating revenue row not found in %s" % pdf)
    return lines[0]


df = pd.DataFrame()  # one column per company, one row per report
for info in os.listdir('10companies'):
    # Company name = CSV file name without its extension; the PDFs of each
    # company are expected in a folder of that name in the working directory.
    ste = os.path.splitext(info)[0]
    folder = os.path.abspath(ste)
    sale = []
    for pdf in os.listdir(folder):
        x = os.path.join(folder, pdf)  # full path; the bare file name alone cannot be opened
        sale_gain = parse_data_line(x)
        # Second line of the matched block, which this notebook treats as
        # the operating revenue.
        sale.append(sale_gain.split("\n")[1])
    df.insert(len(df.columns), ste, sale)  # append as the next column
print(df)
df.to_csv("9companies")  # table of the companies processed in this loop

                东北证券               中信证券               光大证券               兴业证券  \
0  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
1  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
2  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
3  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
4  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
5  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
6  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    

                国信证券               国元证券               国海证券               招商证券  \
0  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
1  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
2  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
3  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
4  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
5  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    
6  5,184,163,900.03   5,184,163,900.03   5,184,163,900.03   5,184,163,900.03    

                西南证券  
0  5,184,163,900.03   
1  5,184,163,900.03   
2  5,184,163,900.03   
3  5,184,163,900.03   
4  5,184,163,900.03   
5  5,184,163,900.03   
6  5,184,163,900.03


import fitz # pip install pymupdf
import re
import pandas as pd
import os
import csv
def getText(pdf):
    """Return the concatenated text of all pages of the PDF at path ``pdf``."""
    doc = fitz.open(pdf)
    try:
        # Page.get_text() replaces the removed Page.getText().
        text = ''.join(page.get_text() for page in doc)
    finally:
        doc.close()
    return text


def get_content(pdf):
    """Return the regex match spanning the "...主要...数据" heading up to the
    first "经营活动产生的".

    Raises ValueError when the section cannot be located (for example when
    the PDF holds scanned images instead of text).
    """
    text = getText(pdf)
    p = re.compile(r'(?<=\n)\D、\s*\D*?主要\D*?数据\D*?\s*(?=\n)(.*?)经营活动产生的',re.DOTALL)
    content = p.search(text)
    if content is None:
        raise ValueError("key accounting data section not found in %s" % pdf)
    return content


def parse_data_line(pdf):
    """Return the text block that starts at the "营业" label line and is
    followed by four numeric lines, taken from the section of ``pdf``
    returned by ``get_content``.

    Raises ValueError when no such block exists.
    """
    content_1 = get_content(pdf)[0]
    subp = r"([0-9,.%\- ]*?)\n"
    psub = "%s%s%s%s" % (subp,subp,subp,subp)
    p = re.compile(r"(?<=\n)营业(\D*?\n)+%s" % psub)
    lines = p.search(content_1)
    if lines is None:
        raise ValueError("operating revenue row not found in %s" % pdf)
    return lines[0]


csv_data = pd.read_csv("9companies")
csv_df = pd.DataFrame(csv_data)
csv_df_new = csv_df.iloc[:7,1:10]  # the revenue table of the nine companies saved earlier
ste = "西部证券"
folder = os.path.abspath(ste)  # PDFs are expected in this folder under the working directory
filenames = os.listdir(folder)
df = pd.DataFrame()
i = 0
sale = []
for pdf in filenames:
    x = os.path.join(folder, pdf)
    sale_gain = parse_data_line(x)  # each PDF is parsed once
    # Second line of the matched block, which this notebook treats as the
    # operating revenue.
    sale.append(sale_gain.split("\n")[1])
df.insert(i, ste, sale)  # 西部证券 revenue for 2014 and 2016-2020
print(df)

Deprecation: 'getText' removed from class 'Page' after v1.19 - use 'get_text'.
mupdf: kid not found in parent's kids array
mupdf: kid not found in parent's kids array
mupdf: kid not found in parent's kids array
mupdf: kid not found in parent's kids array
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object
mupdf: invalid page object

                  西部证券
0    1,938,470,480.53 
1   3,406,329,331.96  
2    3,169,944,961.76 
3    2,237,341,729.03 
4    3,680,544,587.40 
5    5,184,163,900.03


def df_lastnew(df):
    """Return ``df`` with one extra row inserted after its first row.

    The inserted row is the 西部证券 revenue "5640878792.18", which the
    notebook enters by hand because the 2015 report could not be parsed.
    Rows are split by index label (label 0 stays first, labels from 1 on
    follow the new row) and the result is re-indexed from 0.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.
    """
    df1 = df.loc[:0]
    df2 = df.loc[1:]
    df3 = pd.DataFrame({"西部证券":["5640878792.18"]})
    return pd.concat([df1, df3, df2], ignore_index=True)
# Complete the 西部证券 column, then place it next to the nine-company table.
df_new = df_lastnew(df)
df_want = pd.concat((csv_df_new, df_new), axis="columns")
print(df_want)
# Written without a file extension; the content is CSV.
df_want.to_csv("10companies_data")

                东北证券                中信证券                光大证券  \
0   3,090,984,262.41  29,197,531,133.19    6,601,422,929.86    
1  6,745,760,224.97   38,001,923,489.02   16,571,087,246.74    
2  4,481,628,728.45    43,291,634,080.53   9,164,639,102.50    
3  4,926,111,998.63   37,220,708,075.49    9,838,147,762.07    
4  6,780,105,834.67   43,139,697,642.01    7,712,277,101.82    
5  7,968,795,586.85   56,013,436,032.55   10,057,362,378.64    
6  6,609,613,343.83   54,382,730,241.56   15,866,343,425.84    

                 兴业证券                国信证券              国元证券  \
0   5,609,064,896.32   11,792,322,619.69   3,486,036,145.60   
1  11,540,612,657.75   29,139,131,599.01   5,773,382,071.47   
2   7,589,066,883.90    12,748,903,313.78  3,375,520,490.03   
3   8,818,781,467.80        1,192,361.02   3,510,702,162.26   
4   6,499,373,437.14        1,003,093.19   2,537,907,348.38   
5  14,249,535,861.49        1,409,291.46   3,198,808,368.09   
6  17,579,687,208.80        1,878,407.12   4,528,625,617.88   

                国海证券                招商证券               西南证券  \
0   2,544,981,928.34  11,002,468,274.67   3,674,829,197.88    
1  4,959,157,199.17   11,695,453,558.82   8,496,799,180.16    
2  3,837,581,191.88   13,353,213,641.86   3,631,659,620.90    
3  1,817,078,708.13   11,321,611,555.03   3,060,764,762.10    
4  2,122,602,077.80   18,708,369,944.73   2,744,154,393.47    
5  3,560,208,077.82   24,277,670,240.59   3,488,837,437.18    
6  4,482,015,217.40    25,291,794,057.95  3,169,571,453.97    

                  西部证券  
0    1,938,470,480.53   
1        5640878792.18  
2   3,406,329,331.96    
3    3,169,944,961.76   
4    2,237,341,729.03   
5    3,680,544,587.40   
6    5,184,163,900.03


import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import csv
from matplotlib.pyplot import MultipleLocator
import numpy as np
csv_data1 = pd.read_csv("10companies_data")
csv_df1 = pd.DataFrame(csv_data1)
# Rows 0-6 and columns 1-10: skips the index column that to_csv wrote first.
csv_df_new1 = csv_df1.iloc[:7, 1:11]
list_row = csv_df_new1.values.tolist()   # one list per table row
list_name = list(csv_df_new1)            # column labels
columns = csv_df_new1.columns
list_columns = [csv_df_new1[label].values.tolist() for label in columns]  # one list per column
for row_values in list_row:
    print(row_values)
    print("\n")

['3,090,984,262.41', '29,197,531,133.19 ', '6,601,422,929.86 ', '5,609,064,896.32 ', '11,792,322,619.69 ', '3,486,036,145.60', '2,544,981,928.34', '11,002,468,274.67 ', '3,674,829,197.88 ', '1,938,470,480.53 ']


['6,745,760,224.97 ', '38,001,923,489.02 ', '16,571,087,246.74 ', '11,540,612,657.75 ', '29,139,131,599.01 ', '5,773,382,071.47', '4,959,157,199.17 ', '11,695,453,558.82 ', '8,496,799,180.16 ', '5640878792.18']


['4,481,628,728.45 ', '43,291,634,080.53', '9,164,639,102.50 ', '7,589,066,883.90 ', '12,748,903,313.78', '3,375,520,490.03', '3,837,581,191.88 ', '13,353,213,641.86 ', '3,631,659,620.90 ', ' 3,406,329,331.96  ']


['4,926,111,998.63 ', '37,220,708,075.49 ', '9,838,147,762.07 ', '8,818,781,467.80 ', '1,192,361.02 ', '3,510,702,162.26', '1,817,078,708.13 ', '11,321,611,555.03 ', '3,060,764,762.10 ', '3,169,944,961.76 ']


['6,780,105,834.67 ', '43,139,697,642.01 ', '7,712,277,101.82 ', '6,499,373,437.14 ', '1,003,093.19 ', '2,537,907,348.38', '2,122,602,077.80 ', '18,708,369,944.73 ', '2,744,154,393.47 ', '2,237,341,729.03 ']


['7,968,795,586.85 ', '56,013,436,032.55 ', '10,057,362,378.64 ', '14,249,535,861.49 ', '1,409,291.46 ', '3,198,808,368.09', '3,560,208,077.82 ', '24,277,670,240.59 ', '3,488,837,437.18 ', '3,680,544,587.40 ']


['6,609,613,343.83 ', '54,382,730,241.56 ', '15,866,343,425.84 ', '17,579,687,208.80 ', '1,878,407.12 ', '4,528,625,617.88', '4,482,015,217.40 ', '25,291,794,057.95', '3,169,571,453.97 ', '5,184,163,900.03 ']


# Show the column labels of the revenue table.
print(list_name)

['东北证券', '中信证券', '光大证券', '兴业证券', '国信证券', '国元证券', '国海证券', '招商证券', '西南证券', '西部证券']


# Print each column's values, followed by a blank separator.
for column_values in list_columns:
    print(column_values)
    print("\n")

['3,090,984,262.41', '6,745,760,224.97 ', '4,481,628,728.45 ', '4,926,111,998.63 ', '6,780,105,834.67 ', '7,968,795,586.85 ', '6,609,613,343.83 ']


['29,197,531,133.19 ', '38,001,923,489.02 ', '43,291,634,080.53', '37,220,708,075.49 ', '43,139,697,642.01 ', '56,013,436,032.55 ', '54,382,730,241.56 ']


['6,601,422,929.86 ', '16,571,087,246.74 ', '9,164,639,102.50 ', '9,838,147,762.07 ', '7,712,277,101.82 ', '10,057,362,378.64 ', '15,866,343,425.84 ']


['5,609,064,896.32 ', '11,540,612,657.75 ', '7,589,066,883.90 ', '8,818,781,467.80 ', '6,499,373,437.14 ', '14,249,535,861.49 ', '17,579,687,208.80 ']


['11,792,322,619.69 ', '29,139,131,599.01 ', '12,748,903,313.78', '1,192,361.02 ', '1,003,093.19 ', '1,409,291.46 ', '1,878,407.12 ']


['3,486,036,145.60', '5,773,382,071.47', '3,375,520,490.03', '3,510,702,162.26', '2,537,907,348.38', '3,198,808,368.09', '4,528,625,617.88']


['2,544,981,928.34', '4,959,157,199.17 ', '3,837,581,191.88 ', '1,817,078,708.13 ', '2,122,602,077.80 ', '3,560,208,077.82 ', '4,482,015,217.40 ']


['11,002,468,274.67 ', '11,695,453,558.82 ', '13,353,213,641.86 ', '11,321,611,555.03 ', '18,708,369,944.73 ', '24,277,670,240.59 ', '25,291,794,057.95']


['3,674,829,197.88 ', '8,496,799,180.16 ', '3,631,659,620.90 ', '3,060,764,762.10 ', '2,744,154,393.47 ', '3,488,837,437.18 ', '3,169,571,453.97 ']


['1,938,470,480.53 ', '5640878792.18', ' 3,406,329,331.96  ', '3,169,944,961.76 ', '2,237,341,729.03 ', '3,680,544,587.40 ', '5,184,163,900.03 ']


def change_type(list_x):
    """Convert a table of revenue strings into floats in units of 10**8.

    list_x -- list of rows, each a list of strings such as "3,090,984,262.41 "
              (thousands separators and surrounding blanks are allowed)

    Returns a new list of rows; each value is divided by 10**8 and rounded to
    two decimals so that it fits on the charts.

    Each row is converted over its own length, so a single-row table works
    and rows of different lengths are converted completely.
    """
    list_want = []
    for row in list_x:
        list_want.append(
            [round(float(cell.replace(",", "")) / 10**8, 2) for cell in row]
        )
    return list_want
# Numeric versions of the row-wise and column-wise tables, produced by change_type.
list_row_1 = change_type(list_row)
list_columns_1 = change_type(list_columns)


# Print the converted table row by row.
for converted_row in list_row_1:
    print(converted_row)

[30.91, 291.98, 66.01, 56.09, 117.92, 34.86, 25.45, 110.02, 36.75, 19.38]
[67.46, 380.02, 165.71, 115.41, 291.39, 57.73, 49.59, 116.95, 84.97, 56.41]
[44.82, 432.92, 91.65, 75.89, 127.49, 33.76, 38.38, 133.53, 36.32, 34.06]
[49.26, 372.21, 98.38, 88.19, 119.24, 35.11, 18.17, 113.22, 30.61, 31.7]
[67.8, 431.4, 77.12, 64.99, 100.31, 25.38, 21.23, 187.08, 27.44, 22.37]
[79.69, 560.13, 100.57, 142.5, 140.93, 31.99, 35.6, 242.78, 34.89, 36.81]
[66.1, 543.83, 158.66, 175.8, 187.84, 45.29, 44.82, 252.92, 31.7, 51.84]


# Print the converted table column by column.
for converted_column in list_columns_1:
    print(converted_column)

[30.91, 67.46, 44.82, 49.26, 67.8, 79.69, 66.1]
[291.98, 380.02, 432.92, 372.21, 431.4, 560.13, 543.83]
[66.01, 165.71, 91.65, 98.38, 77.12, 100.57, 158.66]
[56.09, 115.41, 75.89, 88.19, 64.99, 142.5, 175.8]
[117.92, 291.39, 127.49, 119.24, 100.31, 140.93, 187.84]
[34.86, 57.73, 33.76, 35.11, 25.38, 31.99, 45.29]
[25.45, 49.59, 38.38, 18.17, 21.23, 35.6, 44.82]
[110.02, 116.95, 133.53, 113.22, 187.08, 242.78, 252.92]
[36.75, 84.97, 36.32, 30.61, 27.44, 34.89, 31.7]
[19.38, 56.41, 34.06, 31.7, 22.37, 36.81, 51.84]


# Font able to render the Chinese labels (Windows path to SimHei).
zhfont1 = matplotlib.font_manager.FontProperties(fname="C:\\Windows\\Fonts\\SimHei.TTF")
name_list = ["2014","2015","2016","2017","2018","2019","2020"]
def x_ticks(list_columns,list_name):
    """Draw, save and show a bar chart of one company's revenue by year.

    list_columns -- the company's revenue values, one per entry of the
                    module-level ``name_list`` (used as tick labels)
    list_name    -- company name; used in the title and as the PNG file name
    """
    # An explicit colour list replaces the deprecated "rgb" colour string;
    # matplotlib cycles through it when there are more bars than colours.
    rects = plt.bar(range(len(list_columns)),list_columns,color=["r","g","b"],width = 1,tick_label=name_list)
    plt.title(list_name+"2014——2020营业收入对比",fontproperties = zhfont1)
    plt.xlabel("年份",fontproperties = zhfont1)
    plt.ylabel("营业收入（亿元）",fontproperties = zhfont1)
    # Write each bar's value just above it.
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2, height, str(height), size=10, ha="center", va="bottom")
    plt.savefig(list_name +".png",dpi = 600)
    plt.show()
# One chart per company: pair each converted column with its company name.
for values, company in zip(list_columns_1, list_name):
    x_ticks(values, company)

<ipython-input-11-75ed3f2431c4>:5: MatplotlibDeprecationWarning: Using a string of single character colors as a color sequence is deprecated since 3.2 and will be removed two minor releases later. Use an explicit list instead.
  rects = plt.bar(range(len(list_columns)),num_list,color="rgb",width = 1,tick_label=name_list)


# Tick labels: the first two characters of every company name.
list_name_1 = [company[0:2] for company in list_name]
def y_ticks(list_row,name_list):
    """Draw, save and show a horizontal bar chart comparing companies in one year.

    list_row  -- revenue values, one per entry of the module-level
                 ``list_name_1`` (used as tick labels)
    name_list -- the year as a string; used in the title and as the PNG
                 file name
    """
    # An explicit colour list replaces the deprecated 'rgby' colour string.
    rects = plt.barh(range(len(list_row)),list_row,color=['r','g','b','y'])
    # One tick per bar instead of a hard-coded count of ten.
    index = np.arange(len(list_row))
    plt.yticks(index,list_name_1,fontproperties = zhfont1)
    plt.title(name_list+"不同公司营业收入对比",fontproperties = zhfont1)
    plt.xlabel("营业收入（亿元）",fontproperties = zhfont1)
    plt.ylabel("公司名称",fontproperties = zhfont1)
    # Write each bar's value to the right of its end.
    for rect in rects:
        w = rect.get_width()
        plt.text(w,rect.get_y()+rect.get_height()/2,str(w),size =10,ha='left',va='center')
    plt.savefig(name_list +".png",dpi = 600)
    plt.show()
# One chart per year: pair each converted row with its year label.
for values, year in zip(list_row_1, name_list):
    y_ticks(values, year)

<ipython-input-12-0a4af112d88f>:8: MatplotlibDeprecationWarning: Using a string of single character colors as a color sequence is deprecated since 3.2 and will be removed two minor releases later. Use an explicit list instead.
  rects = plt.barh(range(len(list_row)),num_list_1,color='rgby')

期末实验报告：同一行业十家公司不同年份营业收入横向、纵向对比¶

将"证券行业.xlsx"转换为"证券行业.csv"¶

对“证券行业.csv”中f_name列进行重命名并进行筛选，保留所需要的年报内容¶

对f_name列进行匹配重命名¶

定义"filter_links"函数用于对表格f_name一列进行筛选，保留所需要的年报¶

定义"sub_with_update"函数，通过该函数对f_name列进行整理，以公司简称分类¶

根据排列情况，提取年报数量最多的十家公司，并将链接存储于"10companies"文件夹中¶

根据所形成的"10companies"文件夹，对内部的csv文件进行拆分，形成下载pdf年报的链接¶

通过提取下载后csv文件里的链接，下载各家公司各个年份的年报pdf文件¶

为了后续操作简便，已将下载好的pdf文件放到不同的文件夹（手动操作），并进行相应命名，截图如下¶

通过正则匹配定位年报中主要会计数据——当年的营业收入，提取放入DataFrame¶

遇到的问题1：需要不断调整正则表达式，写出满足可以匹配所有年报的通用表达式¶

遇到的问题2：在运行过程中发现"西部证券"公司2015年年报不是text文本无法提取内容。¶

由于问题2的存在，所以我将"西部证券"公司移出文件夹，后续单独提取该公司各年份的营业收入¶

单独提取"西部证券"公司的营业收入，放入表格，并将该表格和已获取的9个公司营业收入的表格合并为一个表格¶

存在的问题3：由于2015年年报不是文本，为了能循环运行程序，将15年年报移出"西部证券"文件夹，获取14，16-20年的营业收入¶

查看西部证券公司2015年营业收入，形成完整的西部证券公司营业收入的表格，再将表格与原先含有9个公司营收的表格合并，得到所需要的最终表格¶

通过表格制作营业收入的对比图¶

以行为单位取营业收入为列表，并将列表中的字符串类型转为浮点型¶

定义change_type的函数，将营业收入由字符串类型转变为浮点型，并保留两位小数¶

制图：定义x_ticks函数，用于制作同一公司（十家）不同年份营业收入的对比即纵向对比（柱状图）¶

纵向对比图¶

制图：定义y_ticks函数，用于制作不同公司同一年份（2014——2020）营业收入的对比即横向对比（条形图）¶

横向对比图¶