洪恒的期末报告

STEP1:提取对应行业股票代码


import fitz
import re
import pandas as pd
import numpy as np
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import matplotlib.pyplot as plt
os.chdir('/Users/h/Desktop/python金融数据获取与处理/期末大作业')

import re
import pandas as pd


class DisclosureTable():
    '''
    Parse the announcement-search result table of cninfo (www.cninfo.com.cn).

    innerHTML is the HTML of the result table body. Every <tr> is expected to
    hold at least four <td> cells in this order: security code, short name,
    announcement title (wrapping an <a href="..."> link) and announcement
    time. Rows with fewer cells are ignored.

    After construction ``df_txt`` holds the raw (still tagged) cell HTML;
    ``get_data()`` returns the cleaned table and stores it in ``df_data``.

    NOTE(review): the tag parts of the original patterns were missing (each
    pattern was a bare '(.*?)', which only ever matches an empty string, and
    get_link() asked for a second group that did not exist). The patterns
    below are written against generic <tr>/<td>/<a> markup rather than the
    exact class names of the page -- confirm against the live page.
    '''
    # Matches any HTML tag; used to reduce a cell to its visible text.
    _p_tag = re.compile(r'<[^>]*>')

    @staticmethod
    def _cell_text(txt):
        # Drop every nested tag and surrounding whitespace from a cell.
        return DisclosureTable._p_tag.sub('', txt).strip()

    def __init__(self, innerHTML):
        self.html = innerHTML
        self.prefix = 'http://www.cninfo.com.cn'
        # Code, time and short name are plain text wrapped in some tags, so
        # one tag-stripping extractor serves all three cells.
        self.get_code = self._cell_text
        self.get_time = self._cell_text
        self.get_name = self._cell_text
        #
        self.txt_to_df()

    def txt_to_df(self):
        # html table text to DataFrame (one row per <tr>, raw cell HTML)
        p_tr = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL)
        p_td = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL)
        rows = [p_td.findall(tr) for tr in p_tr.findall(self.html)]
        # Skip rows that cannot hold the four expected cells.
        rows = [tds for tds in rows if len(tds) >= 4]

        df = pd.DataFrame({'证券代码': [td[0] for td in rows],
                           '简称': [td[1] for td in rows],
                           '公告标题': [td[2] for td in rows],
                           '公告时间': [td[3] for td in rows]})
        self.df_txt = df

    def get_link(self, txt):
        '''
        Return [href, title] of the first <a> link inside a title cell.

        Raises ValueError when the cell contains no <a href="..."> link.
        '''
        p = re.compile(r'<a\b[^>]*?href="(.*?)"[^>]*>(.*?)</a>', re.DOTALL)
        matchObj = p.search(txt)
        if matchObj is None:
            raise ValueError('no <a href="..."> link found in title cell')
        href = matchObj.group(1).strip()
        # innerHTML escapes '&' as '&amp;'; turn it back into a usable URL.
        href = re.sub('amp;', '', href)
        title = self._cell_text(matchObj.group(2))
        return [href, title]

    def get_data(self):
        '''
        Build, store (``df_data``) and return the cleaned DataFrame with the
        columns 证券代码, 简称, 公告标题, href, 公告时间.
        '''
        raw = self.df_txt
        codes = [self.get_code(td) for td in raw['证券代码']]
        short_names = [self.get_name(td) for td in raw['简称']]
        ahts = [self.get_link(td) for td in raw['公告标题']]
        times = [self.get_time(td) for td in raw['公告时间']]
        #
        prefix = self.prefix
        df = pd.DataFrame({'证券代码': codes,
                           '简称': short_names,
                           '公告标题': [aht[1] for aht in ahts],
                           'href': [prefix + aht[0] for aht in ahts],
                           '公告时间': times
            })
        self.df_data = df
        return df

'''
f = open('innerHTML.html', encoding="utf-8")
html = f.read()
f.close()
dt = DisclosureTable(html)
df = dt.get_data()                        #提取信息
df                                        #获得结果
df.to_csv('data.csv')
'''

结果展示

行业分类中需要分析的住宿业上市公司

STEP2:下载年报


  browser = webdriver.Safari()        # use the Safari browser
  browser.maximize_window()
  def get_cninfo(code):               # scrape the annual-report list from cninfo
      """Search cninfo for `code` and return the result table's innerHTML.

      Uses the module-level `browser`. The page is driven with fixed sleeps
      between steps, and the date input is set to 2013-01-01 before the
      query button is clicked.
      """
      search_url = 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh'
      code_box = ".el-autocomplete > .el-input--medium > .el-input__inner"
      query_btn = ".query-btn"
      date_box = ".el-range-input:nth-child(2)"

      def css(selector):
          # Look the element up afresh on every use, as the page re-renders.
          return browser.find_element(By.CSS_SELECTOR, selector)

      browser.get(search_url)
      css(code_box).send_keys(code)
      time.sleep(1)
      css(query_btn).send_keys(Keys.DOWN)
      css(query_btn).send_keys(Keys.ENTER)
      time.sleep(2)

      css(date_box).click()
      time.sleep(0.5)
      css(date_box).clear()
      css(date_box).send_keys("2013-01-01")
      css(date_box).send_keys(Keys.ENTER)
      time.sleep(0.1)

      css(query_btn).click()
      time.sleep(2)
      table_body = browser.find_element(By.CLASS_NAME, 'el-table__body')
      return table_body.get_attribute('innerHTML')

  def html_to_df(innerHTML):          # convert the table HTML to a DataFrame
      """Parse a cninfo result table into a DataFrame.

      The HTML is first written to 'innerHTML.html' in the current working
      directory (overwriting any previous copy) and read back, then handed
      to DisclosureTable; the cleaned table from get_data() is returned.
      """
      # `with` closes the file even if the write or read raises.
      with open('innerHTML.html', 'w', encoding='utf-8') as f:
          f.write(innerHTML)
      with open('innerHTML.html', encoding='utf-8') as f:
          html = f.read()
      return DisclosureTable(html).get_data()

  # Crawl the announcement table of every stock code and stack the results.
  # NOTE(review): `code` is not defined in the visible part of this document;
  # presumably it is the list of stock codes collected in STEP1 -- confirm.
  # NOTE(review): the statements that follow this loop in the document are
  # indented to the loop-body level; they look like separate cells that were
  # meant to run once after the loop -- confirm.
  frames = []
  for i in code:
      innerHTML = get_cninfo(i)
      time.sleep(0.2)
      frames.append(html_to_df(innerHTML))
      time.sleep(0.2)
  # DataFrame.append was removed in pandas 2.0; concatenate once instead.
  # An empty `code` still yields an empty DataFrame, as before.
  df = pd.concat(frames) if frames else pd.DataFrame()

      def filter_links(words,df0,include=True):
          """Filter the rows of df0 by keywords in the '公告标题' column.

          include=True keeps rows whose title contains at least one word;
          include=False keeps rows whose title contains none of the words.
          """
          titles = df0['公告标题']
          if include:
              keep = [any(word in title for word in words) for title in titles]
          else:
              keep = [all(word not in title for word in words) for title in titles]
          return df0[keep]

      words1 = ["摘要","已取消"]
      # Drop report summaries and cancelled announcements.
      # NOTE: the name `list` shadows the builtin; it is kept only because
      # the later steps of this script refer to the table by that name.
      # .copy() makes the filtered table independent of `df`, so the column
      # assignments below act on a real table rather than on a slice.
      list = filter_links(words1,df,include=False).copy()
      # Raw strings: '\d' in a normal string literal is an invalid escape.
      fun1 = lambda x: re.sub(r'(?<=报告).*', '', x)
      fun2 = lambda x: re.sub(r'.*(?=\d{4})', '', x)
      # Trim everything after "报告" and everything before the 4-digit year.
      list['公告标题'] = list['公告标题'].apply(fun1)
      list['公告标题'] = list['公告标题'].apply(fun2)
      #exception = list[~list['公告标题'].str.contains('年年度报告')]
      # keep='first' retains the first listed row of each duplicate group
      # (presumably the most recent one -- confirm the table order).
      list = list.drop_duplicates(['证券代码','公告标题','公告时间'], keep='first')
      # Fix two titles whose format differs from the rest. Positional
      # indexing on the frame avoids chained assignment
      # (list['公告标题'].iloc[...] = ...), which may write to a temporary.
      title_col = list.columns.get_loc('公告标题')
      list.iloc[-20, title_col] = '2021年年度报告'
      list.iloc[-19, title_col] = '2020年年度报告'
      list['年份'] = [re.search(r'\d{4}', title).group() for title in list['公告标题']]
      list['公告标题'] = list['简称']+list['公告标题']

      # exist_ok=True lets the script be re-run after 'files' already exists.
      os.makedirs('files', exist_ok=True)
      os.chdir('/Users/h/Desktop/python金融数据获取与处理/期末大作业/files')
      def get_pdf(r):             # download one cninfo report PDF
          """Download the PDF behind an announcement detail page.

          r is the response of the detail page. Its text must contain the
          script variables `announcementId` and `announcementTime`; the PDF
          URL is built as finalpage/<announcementTime>/<announcementId>.PDF
          on static.cninfo.com.cn. Returns the response of that download.

          Raises ValueError when the two variables cannot be found.
          """
          # Non-greedy groups: the original greedy '(.*)' would swallow any
          # other '...";' statement sitting between the two variables.
          p_id = re.compile(r'var announcementId = "(.*?)";.*?var announcementTime = "(.*?)"',re.DOTALL)
          found = p_id.search(r.text)
          if found is None:
              raise ValueError('announcementId/announcementTime not found in detail page')
          announcement_id, announcement_time = found.groups()
          new_url = "http://static.cninfo.com.cn/finalpage/" + announcement_time + '/' + announcement_id + ".PDF"
          result = requests.get(new_url, allow_redirects=True)
          time.sleep(1)
          return result
      # Download every listed report of every company as '<title>.PDF' in
      # the current directory. A report that fails is printed and skipped.
      for c in code:
          rpts = list[list['证券代码']==c]
          for _, rpt_row in rpts.iterrows():
              title = rpt_row['公告标题']
              try:
                  r = requests.get(rpt_row['href'], allow_redirects=True)
                  time.sleep(0.3)
                  try:
                      result = get_pdf(r)
                  finally:
                      r.close()           # release the connection on failure too
                  with open(title+'.PDF', 'wb') as f:
                      f.write(result.content)
              except Exception as exc:    # best effort: report and move on
                  print(title, exc)

结果展示

经过筛选和过滤后，list中包含住宿业5家公司2012年至2021年的全部年报，公告标题统一为“公司简称+20xx年年度报告”

下载年报，放入相应文件夹

STEP3：解析年报数据，从上市公司年报中提取营业收入、基本每股收益数据，并保存为csv文件


  def get_data(rpt):         # extract operating revenue and basic EPS
      """Return (revenue, pe) as strings from an annual report.

      rpt is an iterable of pages, each offering get_text() (e.g. an opened
      PDF document). The "主要...数据和..." section is located first; inside
      it the first number after "营" (up to "归") is the revenue and the
      first number after "基" (up to "稀") is the basic earnings per share.
      Thousands separators are removed from the revenue. At most two
      decimals are captured for either number.

      Raises ValueError when the section or one of the numbers is missing.
      """
      text = ''.join(page.get_text() for page in rpt)
      p_s = re.compile(r'(?<=\n)[\D、]?\D*?主要\D*?数据和\D*?(?=\n)(.*?)稀', re.DOTALL)
      section = p_s.search(text)
      if section is None:
          raise ValueError('key accounting data section not found')
      txt = section.group(0)
      # A number: optional sign, digits with commas, one or two decimals.
      p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}')

      def first_number(pattern, label):
          # First number inside the span matched by `pattern`; line breaks
          # are removed first because the PDF text is split across lines.
          span = pattern.search(txt)
          number = p_digit.search(span.group().replace('\n', '')) if span else None
          if number is None:
              raise ValueError(label + ' not found')
          return number.group()

      p1 = re.compile('营(.*?)归',re.DOTALL)      # revenue rows
      p2 = re.compile('基(.*?)稀',re.DOTALL)      # basic EPS rows
      # The original read an undefined name here (`turnover`), so every
      # call failed with NameError.
      revenue = first_number(p1, 'revenue').replace(',','')
      pe = first_number(p2, 'basic EPS')
      return revenue,pe

  def get_information(rpt):   # extract office address and company website
      """Return (office_address, website) from an annual report.

      rpt is an iterable of pages, each offering get_text(). The address is
      the text following a line that ends in "办公地址", the website the text
      following a line of the form "公司...网址"; in both cases the value may
      sit on the same line or on the next one.

      Raises ValueError when either field cannot be found.
      """
      text = ''.join(page.get_text() for page in rpt)
      # Raw strings: '\w' and '\s' in normal literals are invalid escapes.
      p1 = re.compile(r'(?<=\n)\w*办公地址：?\s?\n?(.*?)\s?(?=\n)', re.DOTALL)
      p2 = re.compile(r'(?<=\n)公司\w*网\s?址：?\s?\n?(.*?)\s?(?=\n)', re.DOTALL)
      address = p1.search(text)
      if address is None:
          raise ValueError('office address not found')
      website = p2.search(text)
      if website is None:
          raise ValueError('company website not found')
      infom1 = address.group(1)
      infom2 = website.group(1)
      return infom1,infom2


  # Collect operating revenue and basic EPS: one row per company, one
  # column per report year (2012-2021). Values are stored as the strings
  # returned by get_data().
  # `list` is a DataFrame here, so the builtin list() cannot be called.
  years = [*range(2012,2022)]
  revenues = pd.DataFrame(columns=['公司'] + years)
  pes = pd.DataFrame(columns=['公司'] + years)
  for i in range(len(code)):
      firm = list[list['证券代码']==code[i]]
      if firm.empty:                       # no report listed for this code
          print(str(code[i])+'解析出错')
          continue
      revenues.loc[i,'公司'] = firm['简称'].iloc[0]
      pes.loc[i,'公司'] = firm['简称'].iloc[0]
      for _, report in firm.iterrows():
          title = report['公告标题']
          try:
              year = int(report['年份'])
              if year not in years:
                  # .loc would otherwise silently add a new column.
                  raise KeyError(year)
              rpt = fitz.open(title+'.PDF')
              try:
                  revenue, pe = get_data(rpt)
              finally:
                  rpt.close()
              # The original misspelled `revenues` here, so the bare except
              # reported every report as failed; .loc also replaces the
              # chained frame[col][row] assignment.
              revenues.loc[i, year] = revenue
              pes.loc[i, year] = pe
          except Exception as exc:         # best effort: report and move on
              print(title+'解析出错', exc)

  # Numeric revenue table: year columns as floats, indexed by company name.
  revenues_n = revenues.iloc[:,1:].astype('float')
  revenues_n.index = revenues['公司']
  revenues_n.to_csv('营业收入汇总.csv')

  # Numeric EPS table, built the same way.
  pes_n = pes.iloc[:,1:].astype('float')
  pes_n.index = pes['公司']
  # The extracted EPS keeps only two decimals, which turns many of this
  # company's values into 0, so its EPS is entered by hand.
  manual_eps = {
      2012: 0.0058,
      2013: -0.006,
      2014: 0.0069,
      2015: -0.0205,
      2016: -0.0073,
      2017: 0.0079,
      2018: 0.0018,
      2019: 0.0021,
      2020: -0.0318,
      2021: -0.0055,
  }
  for eps_year, eps_value in manual_eps.items():
      pes_n.loc['*ST东海A',eps_year] = eps_value
  pes_n.to_csv('每股收益汇总.csv')

  # Collect office address and website of every company from its most
  # recent annual report and save them to '公司信息.csv'.
  info = pd.DataFrame(columns=['股票代码', '股票简称', '办公地址', '公司网址'])
  for i in range(len(code)):
      firm = list[list['证券代码']==code[i]]
      if firm.empty:                       # no report listed for this code
          print(str(code[i])+'解析出错')
          continue
      # Row with the largest '年份'. Its title is resolved before the try
      # so the error message below cannot itself raise.
      latest = firm.iloc[firm['年份'].argsort().iloc[-1]]
      title = latest['公告标题']
      try:
          rpt = fitz.open(title+'.PDF')
          try:
              info1,info2 = get_information(rpt)
          finally:
              rpt.close()
      except Exception as exc:             # best effort: report and move on
          print(title+'解析出错', exc)
          continue
      info.loc[i,'股票代码'] = firm['证券代码'].iloc[0]
      info.loc[i,'股票简称'] = firm['简称'].iloc[0]
      info.loc[i,'办公地址'] = info1
      info.loc[i,'公司网址'] = info2

  info.to_csv('公司信息.csv')

一些碰到的问题与解决方案

锦江股份有3年报告公告标题相同，可通过drop_duplicates函数通过条件删去同一年中相同的一个，再用iloc定位改其标题
因*ST东海A公司的基本每股收益在保留两位小数后很多数值为0，故这里手动录入其基本每股收益

问题1

问题2

结果展示

营业收入

详细内容可通过以下链接查看：

营业收入汇总.csv

基本每股收益

详细内容可通过以下链接查看：

每股收益汇总.csv

公司信息

详细内容可通过以下链接查看：

公司信息.csv

STEP4:绘制图表并分析


  # Line chart of one company's operating revenue over the years
  figure1 = revenues_n      # same object: the changes below also show on revenues_n
  figure1['公司简称'] = figure1.index
  figure1.index = [*range(len(figure1))]
  # Mean over the ten year columns (sum divided by 10).
  figure1['mean'] = figure1.iloc[:, :10].sum(axis=1) / 10
  figure1 = figure1.sort_values('mean', ascending=False).iloc[:10]
  figure1.iloc[:, :10] = figure1.iloc[:, :10] / 100000000    # yuan -> 100 million yuan

  import matplotlib.font_manager as fm
  fname = "/System/Library/Fonts/STHeiti Light.ttc"
  zhfont1 = fm.FontProperties(fname=fname)    # font able to render Chinese labels
  i = 0                                       # row of figure1 to draw
  year_columns = figure1.columns[:10]
  plt.plot(year_columns, figure1.iloc[i, :10], marker='o')
  plt.xticks(np.linspace(2012, 2021, 10))
  plt.xlabel('年份', fontproperties=zhfont1, fontsize=13)
  plt.ylabel('营业收入（亿元）', fontproperties=zhfont1, fontsize=11)
  plt.title(figure1.iloc[i, 10] + "营业收入折线图", fontproperties=zhfont1, fontsize=14)
  plt.show()

  # Line chart of one company's basic EPS over the years
  figure2 = pes_n           # same object: the changes below also show on pes_n
  figure2['公司简称'] = figure2.index
  figure2.index = [*range(len(figure2))]
  # Mean over the ten year columns (sum divided by 10).
  figure2['mean'] = figure2.iloc[:, :10].sum(axis=1) / 10
  figure2 = figure2.sort_values('mean', ascending=False).iloc[:10]

  import matplotlib.font_manager as fm
  fname = "/System/Library/Fonts/STHeiti Light.ttc"
  zhfont1 = fm.FontProperties(fname=fname)    # font able to render Chinese labels
  i = 4                                       # row of figure2 to draw
  year_columns = figure2.columns[:10]
  plt.plot(year_columns, figure2.iloc[i, :10], marker='o')
  plt.xticks(np.linspace(2012, 2021, 10))
  plt.xlabel('年份', fontproperties=zhfont1, fontsize=13)
  plt.ylabel('基本每股收益（元/股）', fontproperties=zhfont1, fontsize=11)
  plt.title(figure2.iloc[i, 10] + "基本每股收益折线图", fontproperties=zhfont1, fontsize=14)
  plt.show()

  # Bar charts comparing the companies in one year: revenue, then basic EPS
  import matplotlib.font_manager as fm
  fname = "/System/Library/Fonts/STHeiti Light.ttc"
  zhfont1 = fm.FontProperties(fname=fname)    # font able to render Chinese labels
  year = 2021
  # (table, divisor or None, title suffix, y-axis label) for each chart.
  # The '公司简称' column is expected to be on both tables already; the
  # line-chart code adds it.
  bar_specs = (
      (revenues_n, 100000000, '年营业收入分布柱状图', '营业收入（亿元）'),
      (pes_n, None, '基本每股收益分布柱状图', '基本每股收益(元/股）'),
  )
  for table, divisor, title_suffix, y_label in bar_specs:
      item = pd.concat([table[year], table['公司简称']], axis=1)
      if divisor is not None:
          item[year] = item[year]/divisor     # yuan -> 100 million yuan
      item = item.sort_values(year, ascending=False).iloc[:10]
      plt.bar(item['公司简称'], height=item[year], width=0.2)
      plt.title(str(year)+title_suffix, fontproperties=zhfont1)
      plt.ylabel(y_label, fontsize=11, fontproperties=zhfont1)
      plt.xticks(rotation=45, fontproperties=zhfont1)
      plt.show()

结果展示

上市公司营业收入随时间变化趋势图（共5家）

上市公司基本每股收益随时间变化趋势图（共5家）

上市公司营业收入横向对比图

上市公司基本每股收益横向对比图

住宿业解读与分析

纵向分析

从营业收入来看，住宿业在2020年之前的表现参差不齐。华天酒店、*ST东海A、首旅酒店三家公司波动较大；而锦江酒店、金陵饭店两家公司表现稳中向好。这主要得益于我国GDP的稳步增长，居民消费水平的不断提高。从基本每股收益来看，5家上市公司均表现出大幅波动，但其基本每股收益仅有1家超过1.0，其余均在0-1之间，甚至有些年份基本每股收益出现负数。这说明住宿业并非高收益行业。
随着时间进入2020年，年初爆发新冠疫情，住宿业受到大幅打击，可以看到2020年5家上市公司的营业收入和基本每股收益都出现了大幅的下跌。
随着疫情得到有效控制，国内经济复苏，各行各业开始积极响应复工复产，业内各上市公司的营业状况也有所好转，营业收入和基本每股收益都有一定程度的上升，数家公司的基本每股收益都从2020年的负数转为2021年的正数。住宿业发展稳中向好，但上市公司大多仍入不敷出。虽然疫情得到了有效控制，但住宿业要恢复到疫情之前的水平，还是需要一定的时间。

横向分析

从行业横向对比来看，锦江酒店、首旅酒店两大龙头的营业收入遥遥领先，龙头地位十分稳固，两家公司几乎占据了80%的市场份额。对比基本每股收益，情况也基本类似。锦江酒店、首旅酒店两大龙头的基本每股收益逐年递增，从2012年的0.6、0.5到2019年的1.2、1.0左右，之后受疫情影响，所有公司的基本每股收益骤减，即使是行业龙头，也只有0.1左右。但从基本每股收益横向对比图来看，华天酒店和*ST东海A在疫情尚未爆发的2019年之前也出现过负数，这值得公司管理者反思：公司可能存在内部经营风险。

总的来说，住宿业内部的市场竞争格局还是相对稳定的，基本上以锦江酒店、首旅酒店两大龙头为主导，其他公司仅占较小的市场份额。

写在最后

经过本次期末作业的练习，我真正做到了独立完成运用网络爬虫、正则表达式、python不同库来进行行业分析。从提取网页表格数据，到获取年报数据并绘图，每一步在做的过程中都会出现各种各样的问题，在做的过程中可能会感觉比较痛苦，但在查阅资料、与老师交流、和同学们讨论后逐一解决这些问题，现在回头再看，才发现充满了成就感。最后感谢老师这一个学期以来的指导，这门课程的内容非常充实，是一门集理论与实践于一体的不可多得的好课，希望未来这门课能越开越好。最后，祝老师工作顺利，身体健康！