洪恒的期末报告

STEP1:提取对应行业股票代码


import fitz
import re
import pandas as pd
import numpy as np
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import matplotlib.pyplot as plt
os.chdir('/Users/h/Desktop/python金融数据获取与处理/期末大作业')

import re
import pandas as pd


class DisclosureTable():
    '''
    解析巨潮网公告页搜索表格
    '''
    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix = 'http://www.cninfo.com.cn'
        #
        p_code = re.compile('(.*?)', re.DOTALL)
        p_time = re.compile('(.*?)', re.DOTALL)
        p_name = re.compile('(.*?)', re.DOTALL)
        self.get_code = lambda txt: p_code.search(txt).group(1).strip()
        self.get_time = lambda txt: p_time.search(txt).group(1).strip()
        self.get_name = lambda txt: p_name.search(txt).group(1).strip()
        #
        self.txt_to_df()

    def txt_to_df(self):
        # html table text to DataFrame
        html = self.html
        p = re.compile('(.*?)', re.DOTALL)
        trs = p.findall(html)

        p2 = re.compile('(.*?)', re.DOTALL)
        tds = [p2.findall(tr) for tr in trs]

        df = pd.DataFrame({'证券代码': [td[0] for td in tds],
                           '简称': [td[1] for td in tds],
                           '公告标题': [td[2] for td in tds],
                           '公告时间': [td[3] for td in tds]})
        self.df_txt = df

    def get_link(self, txt):
        p_txt = '(.*?)'
        p = re.compile(p_txt, re.DOTALL)
        matchObj = p.search(txt)
        href = matchObj.group(1).strip()
        href = re.sub('amp;','',href)
        title = matchObj.group(2).strip()
        return([href, title])

    def get_data(self):
        get_code = self.get_code
        get_time = self.get_time
        get_link = self.get_link
        get_name = self.get_name
        #
        df = self.df_txt
        codes = [get_code(td) for td in df['证券代码']]
        short_names = [get_name(td) for td in df['简称']]
        ahts = [get_link(td) for td in df['公告标题']]
        times = [get_time(td) for td in df['公告时间']]
        #
        prefix = self.prefix
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[1] for aht in ahts],
                           'href': [prefix + aht[0] for aht in ahts],
                           '公告时间': times
            })
        self.df_data = df
        return(df)

'''
f = open('innerHTML.html', encoding="utf-8")
html = f.read()
f.close()
dt = DisclosureTable(html)
df = dt.get_data()                        #提取信息
df                                        #获得结果
df.to_csv('data.csv')
'''



结果展示

行业分类中需要分析的住宿业上市公司

结果截图

STEP2:下载年报


  browser = webdriver.Safari()        #使用Safari浏览器
  browser.maximize_window()
  def get_cninfo(code):               #爬取巨潮网年报信息
      browser.get('http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh')
      browser.find_element(By.CSS_SELECTOR, ".el-autocomplete > .el-input--medium > .el-input__inner").send_keys(code)
      time.sleep(1)
      browser.find_element(By.CSS_SELECTOR, ".query-btn").send_keys(Keys.DOWN)
      browser.find_element(By.CSS_SELECTOR, ".query-btn").send_keys(Keys.ENTER)
      time.sleep(2)
      browser.find_element(By.CSS_SELECTOR, ".el-range-input:nth-child(2)").click()
      time.sleep(0.5)
      browser.find_element(By.CSS_SELECTOR, ".el-range-input:nth-child(2)").clear()
      browser.find_element(By.CSS_SELECTOR, ".el-range-input:nth-child(2)").send_keys("2013-01-01")
      browser.find_element(By.CSS_SELECTOR, ".el-range-input:nth-child(2)").send_keys(Keys.ENTER)
      time.sleep(0.1)
      browser.find_element(By.CSS_SELECTOR, ".query-btn").click()
      time.sleep(2)
      element = browser.find_element(By.CLASS_NAME, 'el-table__body')
      innerHTML = element.get_attribute('innerHTML')
      return innerHTML

  def html_to_df(innerHTML):          #转换为Dataframe
      f = open('innerHTML.html','w',encoding='utf-8')          #创建html文件
      f.write(innerHTML)
      f.close()
      f = open('innerHTML.html', encoding="utf-8")
      html = f.read()
      f.close()
      dt = DisclosureTable(html)
      df = dt.get_data()
      return df

  df = pd.DataFrame()
  for i in code:
      innerHTML = get_cninfo(i)
      time.sleep(0.2)
      df = df.append(html_to_df(innerHTML))
      time.sleep(0.2)

      def filter_links(words,df0,include=True):
          ls = []
          for word in words:
              if include:
                  ls.append([word in f for f in df0['公告标题']])
              else:
                  ls.append([word not in f for f in df0['公告标题']])
          index = []
          for r in range(len(df0)):
              flag = not include
              for c in range(len(words)):
                  if include:
                      flag = flag or ls[c][r]
                  else:
                      flag = flag and ls[c][r]
              index.append(flag)
          df1 = df0[index]
          return(df1)

      words1 = ["摘要","已取消"]
      list = filter_links(words1,df,include=False)    #删除摘要和已取消的报告
      fun1 = lambda x: re.sub('(?<=报告).*', '', x)
      fun2 = lambda x: re.sub('.*(?=\d{4})', '', x)
      list['公告标题'] = list['公告标题'].apply(fun1)    #去除“20xx年度报告”前后内容
      list['公告标题'] = list['公告标题'].apply(fun2)
      #exception = list[~list['公告标题'].str.contains('年年度报告')]
      list = list.drop_duplicates(['证券代码','公告标题','公告时间'], keep='first')  #删去重复值,保留最新一项
      list['公告标题'].iloc[-20] = '2021年年度报告'  #以下两行为修改格式不同一的公告标题
      list['公告标题'].iloc[-19] = '2020年年度报告'
      list['年份'] = [re.search('\d{4}', title).group() for title in list['公告标题']]
      list['公告标题'] = list['简称']+list['公告标题']

      os.makedirs('files')
      os.chdir('/Users/h/Desktop/python金融数据获取与处理/期末大作业/files')
      def get_pdf(r):             #构建下载巨潮网报告pdf函数
          p_id = re.compile('.*var announcementId = "(.*)";.*var announcementTime = "(.*?)"',re.DOTALL)
          contents = r.text
          a_id = re.findall(p_id, contents)
          new_url = "http://static.cninfo.com.cn/finalpage/" + a_id[0][1] + '/' + a_id[0][0] + ".PDF"
          result = requests.get(new_url, allow_redirects=True)
          time.sleep(1)
          return result
      for c in code:
          rpts = list[list['证券代码']==c]
          for row in range(len(rpts)):
              r = requests.get(rpts.iloc[row,3], allow_redirects=True)
              time.sleep(0.3)
              try:
                  result = get_pdf(r)
                  f = open(rpts.iloc[row,2]+'.PDF', 'wb')
                  f.write(result.content)
                  f.close()
                  r.close()
              except:
                  print(rpts.iloc[row,2])
                  pass

结果展示

经过筛选和过滤后,list中含有住宿业5家公司所有公告标题统一为“公司简称+20xx年年度报告”的从2012年至2021年的年报

结果截图 结果截图

下载年报,放入相应文件夹

结果截图 结果截图 结果截图 结果截图 结果截图

STEP3:解析年报数据,从上市公司年报中提取营业收入、基本每股收益数据,并保存为csv文件


  def get_data(rpt):         #构建获取营业收入和每股收益数据的函数
      text = ''
      for page in rpt:
          text += page.get_text()
      p_s = re.compile('(?<=\\n)[\D、]?\D*?主要\D*?数据和\D*?(?=\\n)(.*?)稀', re.DOTALL)
      txt =  p_s.search(text).group(0)                              #匹配对应内容
      p1 = re.compile('营(.*?)归',re.DOTALL)                #匹配年报中3年的营业收入
      data = p1.search(txt).group()
      data = data.replace('\n', '')                                 #替换掉换行符
      p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')              #匹配内容中的数字到小数点后2位
      revenue = p_digit.search(data).group()
      revenue = turnover.replace(',','')                           #去掉逗号
      p2 = re.compile('基(.*?)稀',re.DOTALL)               #匹配年报中3年的基本每股收益
      data = p2.search(txt).group()
      data = data.replace('\n', '')
      pe = p_digit.search(data).group()
      return revenue,pe

  def get_information(rpt):   #构建获取办公地址、公司网址等信息的函数
      text = ''
      for page in rpt:
          text += page.get_text()
      p1 = re.compile('(?<=\\n)\w*办公地址:?\s?\n?(.*?)\s?(?=\\n)', re.DOTALL)
      infom1 = p1.findall(text)[0]
      p2 = re.compile('(?<=\n)公司\w*网\s?址:?\s?\n?(.*?)\s?(?=\n)', re.DOTALL)
      infom2 = p2.findall(text)[0]
      return infom1,infom2


  #获取营业收入和每股收益数据
  revenues = pd.DataFrame(columns=['公司'] + [year for year in range(2012,2022)])
  pes = pd.DataFrame(columns=['公司'] + [year for year in range(2012,2022)])
  for i in range(len(code)):
      firm = list[list['证券代码']==code[i]]
      revenues.loc[i,'公司'] = firm.iloc[0,1]
      pes.loc[i,'公司'] = firm.iloc[0,1]
      for item in range(len(firm)):
          try:
              rpt = fitz.open(firm.iloc[item,2]+'.PDF')
              revenue, pe = get_data(rpt)
              reveues[int(firm.iloc[item,-1])][i] = revenue
              pes[int(firm.iloc[item,-1])][i] = pe
          except:
              print(firm.iloc[item,2]+'解析出错')

  revenues_n = revenues.iloc[:,1:].astype('float')
  revenues_n.index = revenues['公司']
  revenues_n.to_csv('营业收入汇总.csv')
  pes_n = pes.iloc[:,1:].astype('float')
  pes_n.index = pes['公司']
  pes_n.loc['*ST东海A',2012] = 0.0058  #因*ST大东海A公司其基本每股收益在取小数点两位后很多数值为0,故这里手动敲入其基本每股收益
  pes_n.loc['*ST东海A',2013] = -0.006
  pes_n.loc['*ST东海A',2014] = 0.0069
  pes_n.loc['*ST东海A',2015] = -0.0205
  pes_n.loc['*ST东海A',2016] = -0.0073
  pes_n.loc['*ST东海A',2017] = 0.0079
  pes_n.loc['*ST东海A',2018] = 0.0018
  pes_n.loc['*ST东海A',2019] = 0.0021
  pes_n.loc['*ST东海A',2020] = -0.0318
  pes_n.loc['*ST东海A',2021] = -0.0055
  pes_n.to_csv('每股收益汇总.csv')

  #获取公司信息
  firm = list[list['证券代码']==code[0]]
  rpt = fitz.open(firm.iloc[firm['年份'].argsort().iloc[-1],2]+'.PDF')
  info = pd.DataFrame(columns=['股票代码', '股票简称', '办公地址', '公司网址'])
  for i in range(len(code)):
      firm = list[list['证券代码']==code[i]]
      try:
          rpt = fitz.open(firm.iloc[firm['年份'].argsort().iloc[-1],2]+'.PDF')
          info1,info2 = get_information(rpt)
          info.loc[i,'股票代码'] = firm.iloc[0,0]
          info.loc[i,'股票简称'] = firm.iloc[0,1]
          info.loc[i,'办公地址'] = info1
          info.loc[i,'公司网址'] = info2
      except:
          print(firm.iloc[firm['年份'].argsort().iloc[-1],2]+'解析出错')

  info.to_csv('公司信息.csv')

一些碰到的问题与解决方案

  1. 锦江股份有3年报告公告标题相同,可通过drop_duplicates函数通过条件删去同一年中相同的一个,再用iloc定位改其标题
  2. 因*ST大东海A公司其基本每股收益在取小数点两位后很多数值为0,故这里手动敲入其基本每股收益

问题1

结果截图

问题2

结果截图

结果展示

营业收入

结果截图

详细内容可通过以下链接查看:

  • 营业收入汇总.csv
  • 基本每股收益

    结果截图

    详细内容可通过以下链接查看:

  • 每股收益汇总.csv
  • 公司信息

    结果截图

    详细内容可通过以下链接查看:

  • 公司信息.csv
  • STEP4:绘制图表并分析

    
      #绘制营业收入变化趋势图表
      figure1 = revenues_n
      figure1['公司简称'] = revenues_n.index
      figure1.index = [i for i in range(len(figure1))]
      figure1['mean'] = figure1.iloc[:,:10].apply(lambda x: x.sum()/10, axis=1)
      figure1 = figure1.sort_values('mean', ascending=False)[:10]
      figure1.iloc[:,:10] = figure1.iloc[:,:10]/100000000
    
      import matplotlib.font_manager as fm
      fname = "/System/Library/Fonts/STHeiti Light.ttc"
      zhfont1 = fm.FontProperties(fname=fname)
      i = 0
      plt.plot(figure1.columns[:10], figure1.iloc[i,:10], marker='o')
      plt.xticks(np.linspace(2012,2021,10))
      plt.xlabel('年份',fontsize=13,fontproperties=zhfont1)
      plt.ylabel('营业收入(亿元)',fontsize=11,fontproperties=zhfont1)
      plt.title(figure1.iloc[i,10]+"营业收入折线图",fontsize=14,fontproperties=zhfont1)
      plt.show()
    
      #绘制每股收益变化趋势图表
      figure2 = pes_n
      figure2['公司简称'] = pes_n.index
      figure2.index = [i for i in range(len(figure2))]
      figure2['mean'] = figure2.iloc[:,:10].apply(lambda x: x.sum()/10, axis=1)
      figure2 = figure2.sort_values('mean', ascending=False)[:10]
    
      import matplotlib.font_manager as fm
      fname = "/System/Library/Fonts/STHeiti Light.ttc"
      zhfont1 = fm.FontProperties(fname=fname)
      i = 4
      plt.plot(figure2.columns[:10], figure2.iloc[i,:10], marker='o')
      plt.xticks(np.linspace(2012,2021,10))
      plt.xlabel('年份',fontsize=13,fontproperties=zhfont1)
      plt.ylabel('基本每股收益(元/股)',fontsize=11,fontproperties=zhfont1)
      plt.title(figure2.iloc[i,10]+"基本每股收益折线图",fontsize=14,fontproperties=zhfont1)
      plt.show()
    
      #绘制逐年营业收入和每股收益图表
      import matplotlib.font_manager as fm
      fname = "/System/Library/Fonts/STHeiti Light.ttc"
      zhfont1 = fm.FontProperties(fname=fname)
      year = 2021
      item = pd.concat([revenues_n[year], revenues_n['公司简称']], axis=1)
      item[year] = item[year]/100000000
      item = item.sort_values(year, ascending=False).iloc[:10]
      plt.bar(item['公司简称'],height=item[year],width=0.2)
      plt.title(str(year)+'年营业收入分布柱状图',fontproperties=zhfont1)
      plt.ylabel('营业收入(亿元)',fontsize=11,fontproperties=zhfont1)
      plt.xticks(rotation=45,fontproperties=zhfont1)
      plt.show()
    
      year = 2021
      item = pd.concat([pes_n[year], pes_n['公司简称']], axis=1)
      item[year] = item[year]
      item = item.sort_values(year, ascending=False).iloc[:10]
      plt.bar(item['公司简称'],height=item[year],width=0.2)
      plt.title(str(year)+'基本每股收益分布柱状图',fontproperties=zhfont1)
      plt.ylabel('基本每股收益(元/股)',fontsize=11,fontproperties=zhfont1)
      plt.xticks(rotation=45,fontproperties=zhfont1)
      plt.show()
    
    
    

    结果展示

    上市公司营业收入随时间变化趋势图(共5家)

    结果截图 结果截图 结果截图 结果截图 结果截图

    上市公司基本每股收益随时间变化趋势图(共5家)

    结果截图 结果截图 结果截图 结果截图 结果截图

    上市公司营业收入横向对比图

    结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图

    上市公司基本每股收益横向对比图

    结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图 结果截图

    住宿业业解读与分析

    写在最后

    经过本次期末作业的练习,我真正做到了独立完成运用网络爬虫、正则表达式、python不同库来进行行业分析。从提取网页表格数据,到获取年报数据并绘图,每一步在做的过程中都会出现各种各样的问题,在做的过程中可能会感觉比较痛苦,但在查阅资料、与老师交流、和同学们讨论后逐一解决这些问题,现在回头再看,才发现充满了成就感。 最后感谢老师这一个学期以来的指导,这门课程的内容非常充实,是一门集理论与实践于一体的不可多得的好课,希望未来这门课能越开越好。最后,祝老师工作顺利,身体健康!