陈诺的实验报告

代码部分一


  import fitz
  import re
  import pandas as pd
  import numpy as np
  import time
  # Extract the stock codes of the companies in the assigned industry and save them as CSV
  pdf = '行业分类.pdf'

  # page = doc.get_toc(2)
  def getText(pdf):
      """Return the text of every page of a PDF, concatenated in page order.

      Args:
          pdf: Path of the PDF file, passed straight to ``fitz.open``.

      Returns:
          A single string holding the extracted text of all pages.
      """
      doc = fitz.open(pdf)
      try:
          chunks = []
          for page in doc:
              # PyMuPDF renamed the camelCase ``getText`` to ``get_text``;
              # prefer the new name and fall back for older installations.
              extract = getattr(page, 'get_text', None) or page.getText
              chunks.append(extract())
      finally:
          # Release the file handle even when a page fails to extract.
          doc.close()
      return ''.join(chunks)

  # Split the extracted text into lines and keep the slice that holds the
  # assigned industry.  All offsets below are tied to how '行业分类.pdf'
  # happens to extract.
  kind=getText(pdf)
  kind=kind.split('\n')
  kind1=kind[3009:3186]
  # Remove lines inside the slice that are not part of the listing
  # (presumably page headers/footers -- verify against the PDF).
  # The order of these deletions matters: each one shifts later indices.
  del kind1[102:]
  del kind1[9:17]
  del kind1[4]
  del kind1[10]
  del kind1[40:42]

  # kind1[0] and kind1[1] name the output file; from index 2 onwards the
  # list alternates stock code, company name.
  codes = kind1[2::2]
  # A name containing '*' loses its first character, as before.
  names = [name[1:] if '*' in name else name for name in kind1[3::2]]

  # Build the table in one step instead of writing cell by cell through
  # chained indexing, which does not update the frame under pandas
  # Copy-on-Write.  Using Series pads a missing trailing name with NaN
  # instead of raising IndexError when the list has an odd length.
  df_com = pd.DataFrame({'股票代码': pd.Series(codes, dtype=object),
                         '公司名称': pd.Series(names, dtype=object)},
                        columns=['股票代码', '公司名称'])

  df_com.to_csv(kind1[0]+kind1[1]+'.csv')

  # 将搜索的公司报告的页面保存为innerHTML.html
  from selenium import webdriver
  from selenium.webdriver.common.by import By
  from selenium.webdriver.common.keys import Keys
  import time


  def shou_suo(a):
      """Search the SZSE periodic-report page for ``a`` and select annual reports.

      Drives the module-level ``browser``: opens the disclosure page, types
      ``a`` into the search box followed by ENTER, opens the announcement-type
      selector and clicks the "年度报告" entry.

      Args:
          a: Text typed into the search box (the caller passes a company name).

      Returns:
          The element with id ``disclosure-table`` located after the clicks.
      """
      browser.get('https://www.szse.cn/disclosure/listed/fixed/index.html')
      search_box = browser.find_element(By.ID, 'input_code')
      search_box.send_keys(a + Keys.ENTER)
      # Result unused: this lookup only fails early if the table is absent.
      browser.find_element(By.ID, 'disclosure-table')
      browser.find_element(By.CSS_SELECTOR, "#select_gonggao .glyphicon").click()
      browser.find_element(By.LINK_TEXT, "年度报告").click()
      return browser.find_element(By.ID, 'disclosure-table')


  # Save the search-result table of every company as innerHTML<name>.html
  browser = webdriver.Edge()
  try:
      for i in df_com.iloc[:,1]:
          element=shou_suo(i)
          # Give the page time to refresh the table before reading it.
          time.sleep(3)
          innerHTML = element.get_attribute('innerHTML')
          with open('innerHTML%s.html'%i,'w',encoding='utf-8') as f:
              f.write(innerHTML)
  finally:
      # Always shut the browser down, even when one of the searches fails.
      browser.quit()

  class DisclosureTable():
      '''
      Parse the search-result table of the SZSE periodic-report page.

      The constructor takes the table's innerHTML, splits it into rows and
      cells (``df_txt``), and ``get_data`` turns the raw cell HTML into a
      clean DataFrame with absolute download and detail-page URLs.
      '''
      # Content of an <a> / <span> element.
      _P_A = re.compile(r'<a.*?>(.*?)</a>', re.DOTALL)
      _P_SPAN = re.compile(r'<span.*?>(.*?)</span>', re.DOTALL)
      # Table rows and their data cells (header cells are <th>, so they are
      # not captured by the cell pattern).
      _P_TR = re.compile(r'<tr.*?>(.*?)</tr>', re.DOTALL)
      _P_TD = re.compile(r'<td.*?>(.*?)</td>', re.DOTALL)
      # attachpath, href and title of the announcement link; the three groups
      # are exactly the ones get_link reads.
      _P_LINK = re.compile(
          r'<a.*?attachpath="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>',
          re.DOTALL)

      def __init__(self, innerHTML):
          self.html = innerHTML
          self.prefix = 'https://disc.szse.cn/download'
          self.prefix_href = 'https://www.szse.cn/'
          self.txt_to_df()

      def get_code(self, txt):
          '''Return the stripped text inside the first <a> element of txt.'''
          return self._P_A.search(txt).group(1).strip()

      def get_time(self, txt):
          '''Return the stripped text inside the first <span> element of txt.'''
          return self._P_SPAN.search(txt).group(1).strip()

      def txt_to_df(self):
          '''Split the table HTML into raw cell HTML and store it in df_txt.'''
          trs = self._P_TR.findall(self.html)
          # The first row is skipped (the original did the same); rows
          # without the four expected cells (e.g. a "no data" placeholder)
          # are ignored instead of raising IndexError.
          rows = [self._P_TD.findall(tr) for tr in trs[1:]]
          tds = [cells for cells in rows if len(cells) >= 4]

          df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                             '简称': [td[1] for td in tds],
                             '公告标题': [td[2] for td in tds],
                             '公告时间': [td[3] for td in tds]})
          self.df_txt = df

      def get_link(self, txt):
          '''Return [attachpath, href, title] extracted from a title cell.'''
          matchObj = self._P_LINK.search(txt)
          attachpath = matchObj.group(1).strip()
          href       = matchObj.group(2).strip()
          title      = matchObj.group(3).strip()
          return([attachpath, href, title])

      def get_data(self):
          '''Build, store (df_data) and return the cleaned announcement table.'''
          df = self.df_txt
          codes = [self.get_code(td) for td in df['证券代码']]
          short_names = [self.get_code(td) for td in df['简称']]
          ahts = [self.get_link(td) for td in df['公告标题']]
          times = [self.get_time(td) for td in df['公告时间']]
          #
          prefix = self.prefix
          # Detail pages live on www.szse.cn, not on the download host.
          prefix_href = self.prefix_href
          df = pd.DataFrame({'证券代码': codes,
                             '简称': short_names,
                             '公告标题': [aht[2] for aht in ahts],
                             'attachpath': [prefix + aht[0] for aht in ahts],
                             'href': [prefix_href + aht[1] for aht in ahts],
                             '公告时间': times
              })
          self.df_data = df
          return(df)

  # Parse every saved result page and store the extracted table as <name>data.csv
  for i in df_com.iloc[:,1]:
      # The context manager closes the file even if reading fails.
      with open('innerHTML%s.html'%i,encoding='utf-8') as f:
          html = f.read()
      dt = DisclosureTable(html)
      df = dt.get_data()
      df.to_csv(i+'data.csv')

  # i=df_com.iloc[0,1]
      # df=pd.read_csv(i+'data.csv')
  # Download every file listed in each company's table

  import requests
  # NOTE(review): the start row 17 looks like a leftover resume point from an
  # interrupted run -- confirm before a full re-run.  The upper bound is
  # len(df_com): the previous "+1" made the last iloc lookup raise IndexError.
  for i in range(17,len(df_com)):
      a=df_com.iloc[i,1]
      df=pd.read_csv(a+'data.csv')
      for l in range(len(df)):
          # Column 4 of the re-read CSV is 'attachpath' (column 0 is the
          # index that to_csv wrote), i.e. the direct download URL.
          href =df.iloc[l,4]
          r = requests.get(href, allow_redirects=True)
          try:
              with open('%s%s.pdf'%(a,l),'wb') as f:
                  f.write(r.content)
          finally:
              r.close()
          # Pause between requests.
          time.sleep(3)

  # Delete PDFs that are not full annual reports
  import os
  files=os.listdir()
  pdf = [name for name in files if name.endswith('.pdf')]
  for path in pdf:
      report1=getText(path)
      is_summary = '年度报告摘要' in report1
      mentions_annual_report = '年度报告' in report1
      # Drop report summaries and anything that never mentions an annual report.
      if is_summary or not mentions_annual_report:
          os.remove(path)

  # Normalise the report file names
  # NOTE(review): the listing uses an absolute folder while getText/os.rename
  # below use bare names, so this only works when that folder is the current
  # working directory -- confirm.
  files=os.listdir(r'C:\Users\cch\Desktop\数据作业')
  df_com=pd.read_csv('32有色金属冶炼和压延加工.csv')

  # Word characters immediately preceding "年度报告"; compiled once, as a raw
  # string so "\w" is not treated as a (deprecated) string escape.
  p1 = re.compile(r'\w*(?=年度报告)',re.DOTALL)
  # NOTE(review): starting at row 2 looks like a leftover resume point -- confirm.
  for i in range(2,len(df_com)):
      # Column 2 of the re-read CSV is the company name (column 0 is the saved index).
      name = df_com.iloc[i,2]
      list1= [f for f in files if name in f and f.endswith('.pdf') ]
      titles = []
      for old in list1:
          report2=getText(old).replace(' ','')
          subtext = p1.findall(report2)
          new=name+subtext[0]+'年年度报告.pdf'
          titles.append(new)
          os.rename(old, new)
      # Build the log in one step rather than through chained indexing,
      # which does not write through under pandas Copy-on-Write.
      df1=pd.DataFrame({'文件原名':list1,'内容首行':titles})
      df1.to_csv(name+'标题.csv')
      # df1=pd.read_csv(df_com.iloc[i,1]+'标题.csv')


  # 用正则表达式提取出数据
  def parse_data_line(pdf):
      """Extract the rows of the key accounting data section of a report.

      The section is located with one of two heading patterns
      ("主要会计数据和财务指标", or "主要会计数据" up to "主要财务指标"), then
      scanned for a non-digit label followed by four numeric fields.

      Args:
          pdf: Path of the report PDF, read through ``getText``.

      Returns:
          A list of 5-tuples as produced by ``findall``: the captured label
          text followed by the four captured numeric fields.

      Raises:
          ValueError: If neither heading pattern is found in the text.
      """
      text = getText(pdf)
      p1 = re.compile(r'\w{1,2}、主要会计数据和财务指标(.*?)(?=\w{1,2}、)',re.DOTALL)
      found = p1.search(text)
      if found is None:
          # Alternative layout: the heading number is not followed by "、".
          p1 = re.compile(r'(\w{1,2})\s*主要会计数据(.*?)(?=(\w{1,2})\s*主要财务指标)',re.DOTALL)
          found = p1.search(text)
      if found is None:
          raise ValueError('key accounting data section not found in %s' % pdf)
      subtext = found.group(0)
      # One numeric field terminated by any whitespace.  The previous
      # "'...\n' and '...\s'" expression always evaluated to this operand.
      subp = r'([0-9,.%\- ]*?)\s'
      p = re.compile(r'(\D+\n)+' + subp * 4)
      return p.findall(subtext)

  def get_basic(pdf):
      """Extract basic company information from a report.

      Args:
          pdf: Path of the report PDF, read through ``getText``.

      Returns:
          A tuple ``(co_name, code, ad, web)`` of ``findall`` result lists for
          the stock short name, stock code, office address and website found
          in the "公司信息" section.

      Raises:
          ValueError: If the "公司信息" section cannot be located.
      """
      text = getText(pdf)
      p1 = re.compile(r'\w{1,2}、公司信息(.*?)(?=\w{1,2}、联系人)',re.DOTALL)
      subtext = p1.findall(text)
      # findall returns a list (never None); an empty list means "no match",
      # which is when the alternative heading layout must be tried.
      if not subtext:
          p1 = re.compile(r'\w{1,2}、\s+公司信息(.*?)(?=\w{1,2}、联系人)',re.DOTALL)
          subtext = p1.findall(text)
      if not subtext:
          raise ValueError('company information section not found in %s' % pdf)
      subtext=subtext[0].replace('\n','')
      p2=re.compile('(?<=股票简称)(.*?)(?=股票代码)')
      co_name=p2.findall(subtext)
      p3=re.compile('(?<=股票代码)(.*?)(?=股票)')
      code=p3.findall(subtext)
      p4=re.compile('(?<=办公地址)(.*?)(?=办公地址的)')
      ad=p4.findall(subtext)
      p5=re.compile('(?<=公司网址)(.*?)(?=电子信箱)')
      web=p5.findall(subtext)
      return co_name,code,ad,web

  # Collect the basic information of every company from its 2021 report
  info_columns = ['股票简称','股票代码','办公地址','公司网址']
  df3=pd.DataFrame(index=range(len(df_com)))
  for col in info_columns:
      df3[col]=''
  files=os.listdir()
  list_rp=[c for c in files if c.endswith('.pdf') and '年度报告' in c and '2021'in c]
  for i, report in enumerate(list_rp):
      list3=get_basic(report)
      # get_basic returns its four result lists in the same order as
      # info_columns; the first hit of each is stored with spaces removed.
      for col, found in zip(info_columns, list3):
          # .loc writes into df3 itself; the former df3[col][[i]] = ... form
          # is chained assignment and is not applied under Copy-on-Write.
          df3.loc[i, col] = found[0].replace(' ','')
  df3.to_csv('公司基本信息.csv')
  # del list_rp[24]
  # df3=df3.dropna(axis=0)
  # df3=df3.reset_index()
  # df3=df3.drop(24,axis=0)

  # Add 2021 operating revenue and basic EPS, then rank companies by revenue
  import ast

  df3['营业收入(元)']=''
  df3['基本每股收益(元/股)']=''
  # NOTE(review): starting at 3 looks like a leftover resume point -- confirm.
  # The stray "i=22" that used to override the loop variable (so only report
  # 22 was ever processed) has been removed.
  for i in range(3,len(list_rp)):
      pdf=list_rp[i]
      lines=parse_data_line(pdf)
      # Row 0 is taken to be operating revenue and row 4 basic EPS, as before;
      # empty captures are dropped so index 1 is the first figure after the label.
      income=[cell for cell in lines[0] if cell != '']
      # literal_eval parses the same numeric literals eval did, without
      # executing arbitrary text extracted from a PDF.
      df3.loc[i,'营业收入(元)']=ast.literal_eval(income[1].replace(',','').strip())
      eps=[cell for cell in lines[4] if cell != '']
      df3.loc[i,'基本每股收益(元/股)']=ast.literal_eval(eps[1].strip())
  df3.to_csv('公司基本信息1.csv')
  # Rank on a numeric view of the column: rows that were never filled still
  # hold '' and cannot be compared with numbers, so they are coerced to NaN
  # for ordering only and end up last.
  revenue_order=pd.to_numeric(df3['营业收入(元)'],errors='coerce').sort_values(ascending=False).index
  df4=df3.loc[revenue_order].reset_index(drop=True)
  df4.to_csv('公司基本信息2.csv')
  df4=pd.read_csv('公司基本信息2.csv')
  # Extract ten years of revenue / EPS data for the ten top-ranked companies
  import ast

  def _pick_row(lines, keyword, candidates):
      """Return the first row among the candidate indices whose label contains
      keyword, with empty captures removed, or None when no row matches."""
      for idx in candidates:
          if idx < len(lines) and keyword in lines[idx][0]:
              return [cell for cell in lines[idx] if cell != '']
      return None

  def _to_number(text):
      """Parse a numeric literal; a safe stand-in for eval on PDF-extracted text."""
      return ast.literal_eval(text.strip())

  df5=df4.iloc[:10,:]
  files=os.listdir(r'C:\Users\cch\Desktop\数据作业')
  list_co=[d for d in df5['股票简称']]
  list_rp=[f for f in files if f.endswith('.pdf') and f[:4] in list_co and '年度报告' in f]
  for c in list_co:
      list_1=[f for f in list_rp if c in f]
      df6=pd.DataFrame(columns=['年份','营业收入(元)','基本每股收益(元/股)'],index=range(len(list_1)))
      for i, y in enumerate(list_1):
          # The year is sliced from a fixed offset at the end of the file
          # name; the offset depends on whether the character 10 from the
          # end is '年'.
          if y[-10]=='年':
              year=y[-14:-9]
          else:
              year=y[-13:-8]
          df6.loc[i,'年份']=year
          lines=parse_data_line(y)
          # Same candidate rows as before (0-2 for revenue, 4-7 for EPS), but
          # a report without a matching row now leaves the cell empty instead
          # of silently reusing the previous report's figures.
          income=_pick_row(lines,'营业收入',range(0,3))
          if income is None:
              print('no operating revenue row found in %s' % y)
          else:
              df6.loc[i,'营业收入(元)']=_to_number(income[1].replace(',',''))
          eps=_pick_row(lines,'基本每股收益',range(4,8))
          if eps is None:
              print('no basic EPS row found in %s' % y)
          else:
              df6.loc[i,'基本每股收益(元/股)']=_to_number(eps[1])
      df6.to_csv('%s十年数据.csv'%c)
      print(df6)
  # Review the extracted data
  for c in list_co:
      df_data=pd.read_csv('%s十年数据.csv'%c)
      print(df_data)

代码部分二


import matplotlib.pyplot as plt
import pandas as pd
# Render Chinese labels and minus signs correctly
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
df4=pd.read_csv('公司基本信息2.csv')

list_co=list(df4['股票简称'])[:10]
# One entry per chart: (data column, line label, colour, y-axis label, title suffix)
trend_specs = (
    ('营业收入(元)', u'年营业收入', 'r', u'营业收入(元)', '营业收入'),
    ('基本每股收益(元/股)', u'基本每股收益', 'blue', u'(元/股)', '基本每股收益'),
)
for c in list_co:
    df_data=pd.read_csv('%s十年数据.csv'%c)
    print(c,df_data)
    df1=df_data.sort_values(by='年份').set_index('年份')
    # Draw the revenue trend first, then the EPS trend, each in its own figure.
    for column, line_label, colour, y_label, suffix in trend_specs:
        plt.figure()
        plt.plot(df1.index,df1[column],label=line_label,color=colour)
        plt.xlabel(u'(年)',fontsize=13)
        plt.ylabel(y_label,fontsize=13,rotation=90)
        plt.title(u'%s%s-%s%s'%(c,df1.index[0],df1.index[-1],suffix),fontsize=13)
        plt.show()

# Gather same-year operating revenue and EPS of all companies side by side
c=list_co[0]
year=pd.read_csv('%s十年数据.csv'%c).sort_values(by='年份').set_index('年份').index
# Both tables start from the first company's years; each concat adds one
# company's column.
df_income=pd.DataFrame(index=year)
df_eps=pd.DataFrame(index=year)
for c in list_co:
    df_data=pd.read_csv('%s十年数据.csv'%c)
    df1=df_data.sort_values(by='年份').set_index('年份')
    df_income=pd.concat([df_income,df1['营业收入(元)']],axis=1)
    df_eps=pd.concat([df_eps,df1['基本每股收益(元/股)']],axis=1)
# Label the columns with the company names and treat missing years as 0.
df_income.columns=list_co
df_income=df_income.fillna(0)
df_eps.columns=list_co
df_eps=df_eps.fillna(0)


# Per-year bar charts comparing the companies
plt.rcParams['font.sans-serif']=['SimHei']  # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render minus signs
# At most ten years, but never more rows than the tables actually hold
# (a fixed range(10) raised IndexError when fewer years were available).
for c in range(min(10,len(df_income),len(df_eps))):
    # One entry per chart: (table, bar label, colour, y-axis label)
    for frame, bar_label, colour, y_label in (
            (df_income, u'营业收入', 'r', u'营业收入(元)'),
            (df_eps, u'基本每股收益(元)', 'blue', u'基本每股收益(元)')):
        plt.figure()
        plt.bar(list_co,frame.iloc[c,:],label=bar_label,color=colour)
        plt.xlabel(u'公司名称',fontsize=13)
        plt.ylabel(y_label,fontsize=13,rotation=90)
        # The row label (the year) is the chart title.
        plt.title(u'%s'%frame.index[c],fontsize=13)
        plt.show()

结果汇总

要求一:下载所分配行业各公司近10年年报;上市不足10年的,下载自上市年至2021年的年报。(结果1截图)要求二:提取“主要会计数据和财务指标”中的“营业收入(元)”和“基本每股收益(元/股)”,并将以上数据保存为本地'.csv'文件。(结果2截图)要求三:提取“股票简称”、“股票代码”、“办公地址”、“公司网址”,并将以上数据保存为本地'.csv'文件。(结果3截图)要求四:绘制趋势图、对比图。(结果4截图、结果5截图、结果6截图、结果6截图)

实验心得

这次实验可能是我遇到的最大的一次挑战。我在代码部分以注释的形式写出了每个步骤的具体目标。此次作业让我更加了解有色金属冶炼这一行业的发展,也让我更加熟悉 Python 数据爬取方面的知识。