As the charts show, Phoenix Publishing & Media (ticker: 601928) leads the other listed companies in the news and publishing industry. In recent years, affected by the pandemic, both operating revenue and net profit attributable to shareholders of the listed companies have fallen, and over the past decade the industry as a whole has trended downward. The news and publishing industry has gone through a series of changes in the last two years; given the decline in operating revenue, it should accelerate digitalization, diversify its media offerings, pay attention to the rise of data journalism, and speed up the digital transformation of publishing so that operating revenue can return to an upward trend.
'''1. Fetch the annual-report links from the SSE website'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import re
import pandas as pd

Codes = ['601811','601858','601900','601921','601928','601949','601999','603096','603999','605577']

def get_table_sse(code):
    browser = webdriver.Edge()
    url = 'http://www.sse.com.cn/disclosure/listedinfo/regular/'
    browser.get(url)
    browser.set_window_size(1550, 830)
    time.sleep(3)
    browser.find_element(By.ID, "inputCode").click()
    browser.find_element(By.ID, "inputCode").send_keys(code)   # e.g. '601919'
    time.sleep(3)
    selector = '.sse_outerItem:nth-child(4) .filter-option-inner-inner'  # report-type dropdown
    browser.find_element(By.CSS_SELECTOR, selector).click()
    browser.find_element(By.LINK_TEXT, "年报").click()
    time.sleep(3)
    selector = "body > div.container.sse_content > div > "
    selector += "div.col-lg-9.col-xxl-10 > div > "
    selector += "div.sse_colContent.js_regular > "
    selector += "div.table-responsive > table"
    element = browser.find_element(By.CSS_SELECTOR, selector)
    table_html = element.get_attribute('innerHTML')
    fname = f'{code}.html'
    f = open(fname, 'w', encoding='utf-8')
    f.write(table_html)
    f.close()
    browser.quit()

def get_table_sse_codes(codes):
    for code in codes:
        get_table_sse(code)
def get_data(tr):
    # Split one table row into its <td> cells (the saved file is the table's innerHTML)
    p_td = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    tds = p_td.findall(tr)
    s = tds[0].find('>') + 1
    e = tds[0].rfind('<')
    code = tds[0][s:e]
    s = tds[1].find('>') + 1
    e = tds[1].rfind('<')
    name = tds[1][s:e]
    s = tds[2].find('href="') + 6
    e = tds[2].find('.pdf"') + 4
    href = 'http://www.sse.com.cn' + tds[2][s:e]
    s = tds[2].find('$(this))">') + 10
    e = tds[2].find('</a>')
    title = tds[2][s:e]
    date = tds[3].strip()
    data = [code, name, href, title, date]
    return data
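As a quick sanity check, get_data can be exercised on a single synthetic row body; the markup below is a simplified, hypothetical stand-in for one row of the saved innerHTML, not a real excerpt from the SSE page.

# Minimal sanity check for get_data (hypothetical, simplified row markup)
sample_tr = (
    '<td><span>601928</span></td>'
    '<td><span>凤凰传媒</span></td>'
    '<td><a href="/disclosure/listedinfo/announcement/c/sample.pdf" '
    'onclick="download($(this))">凤凰传媒年度报告</a></td>'
    '<td> 2023-04-28 </td>'
)
print(get_data(sample_tr))
# ['601928', '凤凰传媒',
#  'http://www.sse.com.cn/disclosure/listedinfo/announcement/c/sample.pdf',
#  '凤凰传媒年度报告', '2023-04-28']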
def parse_table(fname, save=True):
    f = open(fname, encoding='utf-8')
    html = f.read()
    f.close()
    # Split the saved table HTML into rows
    p = re.compile('<tr>(.+?)</tr>', re.DOTALL)
    trs = p.findall(html)
    trs_new = []
    for tr in trs:
        if tr.strip() != '':
            trs_new.append(tr)
    data_all = [get_data(tr) for tr in trs_new[1:]]
    df = pd.DataFrame({
        'code': [d[0] for d in data_all],
        'name': [d[1] for d in data_all],
        'href': [d[2] for d in data_all],
        'title': [d[3] for d in data_all],
        'date': [d[4] for d in data_all]
    })
    if save:
        df.to_csv(f'{fname[0:-5]}.csv')
    return df
get_table_sse_codes(Codes)
df1=parse_table('601811.html')
df2=parse_table('601858.html')
df3=parse_table('601900.html')
df4=parse_table('601921.html')
df5=parse_table('601928.html')
df6=parse_table('601949.html')
df7=parse_table('601999.html')
df8=parse_table('603096.html')
df9=parse_table('603999.html')
df10=parse_table('605577.html')
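The ten explicit parse_table calls above could equally be written as a loop over Codes; a minimal, functionally equivalent sketch (df_list is a hypothetical name):

# Equivalent loop form: df_list[0]..df_list[9] correspond to df1..df10
df_list = [parse_table(f'{code}.html') for code in Codes]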
'''2. Filter out unimportant announcement links'''
import time

def filter_links(words, df, include=True):
    ls = []
    for word in words:
        if include:
            ls.append([word in f for f in df['title']])
        else:
            ls.append([word not in f for f in df['title']])
    index = []
    for r in range(len(df)):
        flag = not include
        for c in range(len(words)):
            if include:
                flag = flag or ls[c][r]
            else:
                flag = flag and ls[c][r]
        index.append(flag)
    df2 = df[index]
    return df2
def filter_date(start, end, df):
    date = df['date']
    v = [d >= start and d <= end for d in date]
    df_new = df[v]
    return df_new

import datetime

def start_end_10y():
    dt_now = datetime.datetime.now()
    current_year = dt_now.year
    start = f'{current_year-9}-01-01'
    end = f'{current_year}-12-31'
    return (start, end)
def filter_nb_10y(df,
                  keep_words=['年报','年度报告'],
                  exclude_words=['摘要','修订稿','持续督导'],
                  start=''):
    if start == '':
        start, end = start_end_10y()
    else:
        start_y = int(start[0:4])
        end = f'{start_y + 9}-12-31'
    df = filter_links(keep_words, df, include=True)
    df = filter_links(exclude_words, df, include=False)
    df = filter_date(start, end, df)
    return df
df_all = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
df_all_n = []
for i in df_all:
    df_all_n.append(filter_nb_10y(i,
                                  keep_words=['年报','年度报告'],
                                  exclude_words=['摘要','修订稿','持续督导'],
                                  start=''))
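To see what the keyword filters keep, filter_links can be exercised on a tiny made-up DataFrame; the titles below are illustrative only, not output from the real announcement tables.

# Illustration of filter_links on hypothetical titles
demo = pd.DataFrame({'title': ['2022年年度报告', '2022年年度报告摘要', '第一季度报告'],
                     'date':  ['2023-04-28', '2023-04-28', '2023-04-15']})
kept = filter_links(['年报', '年度报告'], demo, include=True)   # keeps the two annual-report rows
kept = filter_links(['摘要'], kept, include=False)              # drops the abstract
print(list(kept['title']))   # ['2022年年度报告']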
'''3. Download the annual reports'''
import requests

def download_pdf(href, code, year):
    r = requests.get(href, allow_redirects=True)
    fname = f'{code}_{year}.pdf'
    f = open(fname, 'wb')
    f.write(r.content)
    f.close()
    r.close()
def download_pdfs(hrefs, code, years):
    for i in range(len(hrefs)):
        href = hrefs[i]
        year = years[i]
        download_pdf(href, code, year)
        time.sleep(30)
    return()

def download_pdfs_codes(list_hrefs, codes, list_years):
    for i in range(len(list_hrefs)):
        hrefs = list_hrefs[i]
        years = list_years[i]
        code = codes[i]
        download_pdfs(hrefs, code, years)
    return()
hrefs = []
for i in range(10):
    hrefs.append(list(df_all_n[i]['href']))
years = []
for i in range(10):
    years.append(list(df_all_n[i]['date']))
download_pdfs_codes(hrefs, Codes, years)
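download_pdf above saves whatever the server returns; a slightly more defensive variant (a sketch, not part of the original workflow; download_pdf_safe is a hypothetical name) could check the HTTP status and let a context manager close the file:

# Hedged alternative: only write the PDF when the request actually succeeded
def download_pdf_safe(href, code, year):
    r = requests.get(href, allow_redirects=True, timeout=60)
    if r.status_code == 200:
        with open(f'{code}_{year}.pdf', 'wb') as f:
            f.write(r.content)
    r.close()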
'''4. Parse the annual reports'''
import fitz
import pandas as pd
import re

# Extract operating revenue and net profit attributable to shareholders of the listed company
def get_th_span(txt):
    nianfen = r'(20\d\d|199\d)\s*年末?'
    s = rf'{nianfen}\s*{nianfen}.*?{nianfen}'
    p = re.compile(s, re.DOTALL)
    matchobj = p.search(txt)
    end = matchobj.end()
    year1 = matchobj.group(1)
    year2 = matchobj.group(2)
    year3 = matchobj.group(3)
    flag = (int(year1) - int(year2) == 1) and (int(year2) - int(year3) == 1)
    while not flag:
        # Keep searching until three consecutive years appear in the table header
        matchobj = p.search(txt[end:])
        end = matchobj.end()
        year1 = matchobj.group(1)
        year2 = matchobj.group(2)
        year3 = matchobj.group(3)
        flag = (int(year1) - int(year2) == 1)
        flag = flag and (int(year2) - int(year3) == 1)
    return matchobj.span()
def get_bounds(txt):
    th_span_1st = get_th_span(txt)
    end = th_span_1st[1]
    th_span_2nd = get_th_span(txt[end:])
    th_span_2nd = (end + th_span_2nd[0], end + th_span_2nd[1])
    #
    s = th_span_1st[1]
    e = th_span_2nd[0]
    #
    while txt[e] not in '0123456789':
        e = e - 1
    return (s, e)
def get_subtext(doc, bounds=('主要会计数据和财务指标', '总资产')):
    start_pageno = 0
    end_pageno = len(doc) - 1
    lb, ub = bounds
    for n in range(len(doc)):
        page = doc[n]
        txt = page.get_text()
        if lb in txt:
            start_pageno = n; break
    for n in range(start_pageno, len(doc)):
        if ub in doc[n].get_text():
            end_pageno = n; break
    txt = ''
    for n in range(start_pageno, end_pageno + 1):
        page = doc[n]
        txt += page.get_text()
    return txt
def parse_fin(txt):
    sales = []
    income = []
    # Extract operating revenue
    p_sales = re.compile(r'营业收入\s?\n?([-\d+,.]*)\s', re.DOTALL)
    p_sales = p_sales.findall(txt)
    if p_sales != []:
        sales1 = re.sub(',', '', p_sales[0])
        sales.append(sales1)
    # Extract net profit attributable to shareholders of the listed company
    p_income = re.compile(r'东的净利润\s?\n?([-\d+,.]*)\s', re.DOTALL)
    p_income = p_income.findall(txt)
    if p_income != []:
        income1 = re.sub(',', '', p_income[0])
        income.append(income1)
    else:
        income.append('未提取出来')  # placeholder meaning "not extracted"
    # Assemble the two figures into a one-row DataFrame
    sales = pd.DataFrame(sales)
    income = pd.DataFrame(income)
    agg = []
    agg = pd.DataFrame(agg)
    agg['营业收入'] = sales.iloc[:, 0:1]
    agg['归属于上市公司股东的净利润'] = income.iloc[:, 0:1]
    return agg
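parse_fin can be checked against a small synthetic text laid out the way the regexes expect; the figures below are invented purely for illustration.

# Synthetic excerpt mimicking the key-figures table text (values are made up)
demo_txt = '营业收入\n1,234,567.89\n归属于上市公司股东的净利润\n234,567.89\n'
print(parse_fin(demo_txt))
# Expected: one row with 营业收入 = '1234567.89' and 净利润 = '234567.89' (still strings at this stage)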
codes = ['601811','601858','601900','601921','601928','601949','601999','603096','603999','605577']
import requests
hh = [[],[],[],[],[],[],[],[],[],[]]
for m in range(len(codes)):
    findataz = []
    colnames = ['营业收入','归属于上市公司股东的净利润']
    findataz = pd.DataFrame(findataz, columns=colnames)
    for i in range(len(years[m])):
        filename = f'{codes[m]}_{years[m][i]}.pdf'
        doc = fitz.open(filename)
        txt = get_subtext(doc)
        span = get_bounds(txt)
        subtxt = txt[span[0]:span[1]]
        fin_data = parse_fin(subtxt)
        findataz = pd.concat([findataz, fin_data])
    hh[m] = findataz
# Manually patch entries in the net-profit column that were not extracted correctly
hh[0].iloc[0:1,1:2]='1396673063.27'
hh[2].iloc[0:1,1:2]='943358883.70'
hh[2].iloc[4:5,1:2]='655293807.85'
hh[2].iloc[5:6,1:2]='611423556.91'
hh[2].iloc[6:7,1:2]='422261959.62'
hh[3].iloc[0:1,1:2]='1413562895.84'
hh[3].iloc[1:2,1:2]='1317067651.75'
hh[4].iloc[1:2,1:2]='2456754308.90'
hh[4].iloc[4:5,1:2]='1324895653.69'
hh[5].iloc[0:1,1:2]='650808316.46'
hh[5].iloc[1:2,1:2]='779934519.78'
hh[5].iloc[2:3,1:2]='740968757.59'
hh[5].iloc[3:4,1:2]='740968757.59'
hh[5].iloc[4:5,1:2]='702771582.88'
hh[5].iloc[5:6,1:2]='601490557.41'
hh[5].iloc[6:7,1:2]='530652216.90'
hh[6].iloc[1:2,1:2]='108765384.53'
hh[6].iloc[2:3,1:2]='152712660.34'
hh[6].iloc[3:4,1:2]='148286660.15'
hh[6].iloc[4:5,1:2]='177328639.04'
hh[6].iloc[6:7,1:2]='122946719.40'
hh[6].iloc[7:8,1:2]='80189714.61'
hh[6].iloc[8:9,1:2]='75009247.64'
hh[6].iloc[9:10,1:2]='70033778.14'
hh[7].iloc[0:1,1:2]='136950321.84'
hh[7].iloc[2:3,1:2]='219689297.19'
hh[7].iloc[3:4,1:2]='240280588.57'
hh[7].iloc[4:5,1:2]='240814176.91'
hh[7].iloc[5:6,1:2]='232322877.80'
hh[8].iloc[2:3,1:2]='14270825.95'
hh[8].iloc[5:6,1:2]='74435174.79'
hh[8].iloc[7:8,1:2]='6619360.13'
# Convert the extracted strings to floats (in billions of yuan)
for i in range(len(hh)):
    # hh[i].astype(float)
    hh[i] = hh[i].apply(pd.to_numeric, errors='coerce')
    hh[i] = hh[i].div(1000000000)
hh[5].index = range(7)
hh[5] = hh[5].drop(3)  # drop the duplicated row (rows 2 and 3 carry the same value)
for i in range(len(hh)):
    hh[i].index = range(2023 - len(hh[i]), 2023)  # re-index each table by report year
import matplotlib
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']
mpl.rcParams['axes.unicode_minus'] = False

# One two-panel figure per company: revenue on top, net profit below
for i in range(len(codes)):
    plt.figure(figsize=(11, 9))
    plt.subplot(2, 1, 1)
    plt.plot(hh[i]['营业收入'], 'r-', label=codes[i], lw=2.0)
    plt.xticks(fontsize=13, rotation=30)
    plt.xlabel(u'日期', fontsize=13)
    plt.yticks(fontsize=13)
    plt.ylabel(u'营业收入(单位:十亿元)', fontsize=13)
    plt.legend(loc=0, fontsize=13)
    plt.grid()
    plt.subplot(2, 1, 2)
    plt.plot(hh[i]['归属于上市公司股东的净利润'], 'b-', label=codes[i], lw=2.0)
    plt.xticks(fontsize=13, rotation=30)
    plt.xlabel(u'日期', fontsize=13)
    plt.yticks(fontsize=13)
    plt.ylabel(u'归属于上市公司股东的净利润(单位:十亿元)', fontsize=13)
    plt.legend(loc=0, fontsize=13)
    plt.grid()

# One combined figure comparing revenue across all ten companies
plt.figure(figsize=(11, 9))
for i in range(len(codes)):
    plt.plot(hh[i]['营业收入'], label=codes[i], lw=2.0)
plt.xticks(fontsize=13, rotation=30)
plt.xlabel(u'日期', fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel(u'营业收入(单位:十亿元)', fontsize=13)
plt.legend(loc=0, fontsize=13)
plt.grid()
# Extract the company website, e-mail address, office address, and the board secretary's name and phone number
def get_subtext_xx(doc, bounds=('公司的法定代表人', '媒体名称')):
    start_pageno = 0
    end_pageno = len(doc) - 1
    lb, ub = bounds
    for n in range(len(doc)):
        page = doc[n]
        txt = page.get_text()
        if lb in txt:
            start_pageno = n; break
    for n in range(start_pageno, len(doc)):
        if ub in doc[n].get_text():
            end_pageno = n; break
    txt = ''
    for n in range(start_pageno, end_pageno + 1):
        page = doc[n]
        txt += page.get_text()
    return txt
def parse_dm_acc(txt):
    # Board secretary's name
    dm_name = re.compile(r'证券事务代表\s+姓名\s+([\u2E80-\u9FFF]+)\s+[\u2E80-\u9FFF]', re.DOTALL)
    dm_name = dm_name.findall(txt)
    # Board secretary's phone number
    dm_pho = re.compile(r'电话\s?\n?([-\d+,.]*)\s', re.DOTALL)
    dm_pho = dm_pho.findall(txt)
    # Company website
    acc_her = re.compile(r'(?<=公司网址)(.*?)(?=电子信箱)', re.DOTALL)
    acc_her = acc_her.findall(txt)
    # E-mail address
    acc_ema = re.compile(r'(?<=公司网址).*?(?<=电子信箱)\s+(.*?)\s+(?=四、 信息披露及备置地点)', re.DOTALL)
    acc_ema = acc_ema.findall(txt)
    # Office address
    acc_sta = re.compile(r'(?<=办公地址)(.*?)(?=公司办公地址的)', re.DOTALL)
    acc_sta = acc_sta.findall(txt)
    # Assemble everything into a one-row DataFrame
    dm_acc = pd.DataFrame(index=range(1))
    dm_acc['董秘姓名'] = dm_name
    dm_acc['董秘电话'] = dm_pho
    dm_acc['公司网址'] = acc_her
    dm_acc['电子邮箱'] = acc_ema
    dm_acc['办公地址'] = acc_sta
    return dm_acc
d_a = []
colname = ['董秘姓名','董秘电话','公司网址','电子邮箱','办公地址']
d_a = pd.DataFrame(d_a, columns=colname)
for m in range(len(codes)):
    filename = f'{codes[m]}_{years[m][1]}.pdf'
    doc = fitz.open(filename)
    txt = get_subtext_xx(doc)
    d_a_data = parse_dm_acc(txt)
    d_a = pd.concat([d_a, d_a_data])
d_a.to_csv('公司基本信息.csv', encoding='utf_8_sig')