李敏的实验报告

1.运行结果

1.1十家公司

酒、饮料和精制茶制造业(由于从后往前数公司年报较少,故从前往后选取的第15-25个)

十家公司

1.2年报获取

十家公司共获取95份年报:600365(2015-2022)、600519(2016-2022)、其他(2013-2022)

html

html

pdf

年报1 年报2

1.3解析年报

营收与净利润数据1 营收与净利润数据2

1.4画图

一家公司十年营收图 一家公司十年净利润图 十家公司十年营收图

1.5公司及董秘基本信息

公司及董秘基本信息

2.代码

代码文件夹图片

2.1年报获取与下载

sse

 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import re
import pandas as pd

def get_table_sse(code):
    """Save the SSE periodic-report table for one stock as ``{code}.html``.

    Opens the SSE periodic-report disclosure page in Chrome, types ``code``
    into the stock-code input, picks the '年报' entry of the report-type
    dropdown and writes the inner HTML of the result table to ``{code}.html``
    in the current working directory.

    Args:
        code: Stock code typed into the search box; also used as the stem of
            the output file name.
    """
    browser = webdriver.Chrome()
    try:
        browser.set_window_size(1552, 840)
        url = 'http://www.sse.com.cn/disclosure/listedinfo/regular/'
        browser.get(url)
        time.sleep(3)  # fixed pause after loading the page
        browser.find_element(By.ID, "inputCode").click()
        browser.find_element(By.ID, "inputCode").send_keys(code)
        time.sleep(3)
        # Open the dropdown in the 4th filter item and choose '年报'.
        selector = ".sse_outerItem:nth-child(4) .filter-option-inner-inner"
        browser.find_element(By.CSS_SELECTOR, selector).click()
        browser.find_element(By.LINK_TEXT, '年报').click()
        time.sleep(3)
        # Grab the result table's markup.
        selector = "body > div.container.sse_content > div > div.col-lg-9.col-xxl-10 > div > div.sse_colContent.js_regular > div.table-responsive > table"
        element = browser.find_element(By.CSS_SELECTOR, selector)
        table_html = element.get_attribute('innerHTML')
    finally:
        # Release the browser even when a locator above raises, so a failed
        # lookup does not leave a Chrome process behind.
        browser.quit()
    with open(f'{code}.html', 'w', encoding='utf-8') as f:
        f.write(table_html)
    
def get_table_sse_codes(codes):
    """Call ``get_table_sse`` once for every stock code in ``codes``, in order."""
    for stock_code in codes:
        get_table_sse(stock_code)
        
def get_data(tr):
    """Extract one announcement record from the inner HTML of a table row.

    The row is split into its ``<td>`` cells. Cells 0 and 1 are read as the
    text between the first '>' and the last '<' of the cell (stock code and
    short name), cell 2 carries the link to the PDF and the announcement
    title, and cell 3 is the date.

    Args:
        tr: Inner HTML of one ``<tr>`` element.

    Returns:
        ``[code, name, href, title, date]`` as strings; ``href`` is prefixed
        with ``http://www.sse.com.cn``.
    """
    # Capture the inner HTML of every <td ...>...</td> cell. The tag
    # delimiters are required: a bare '(.*?)' only yields empty matches.
    p_td = re.compile('<td.*?>(.*?)</td>', re.DOTALL)
    tds = p_td.findall(tr)
    # Stock code: text wrapped by the cell's inner tag.
    s = tds[0].find('>') + 1
    e = tds[0].rfind('<')
    code = tds[0][s:e]
    # Company short name, same layout as the code cell.
    s = tds[1].find('>') + 1
    e = tds[1].rfind('<')
    name = tds[1][s:e]
    # Link: from just after href=" up to and including the '.pdf' suffix.
    s = tds[2].find('href="') + 6
    e = tds[2].find('.pdf"') + 4
    href = 'http://www.sse.com.cn' + tds[2][s:e]
    # Title: text between the end of the onclick attribute and the closing </a>.
    s = tds[2].find('$(this))">') + 10
    e = tds[2].find('</a>')
    title = tds[2][s:e]
    # Publication date.
    date = tds[3].strip()

    data = [code, name, href, title, date]
    return(data)

def parse_table(code,save=True):
    """Parse the saved ``{code}.html`` table into a DataFrame of announcements.

    Args:
        code: Stem of the HTML file to read (``{code}.html``).
        save: When true, also write the result to ``{code}.CSV``.

    Returns:
        DataFrame with the columns ``code``, ``name``, ``href``, ``title`` and
        ``date``; one row per table row after the first (header) row.
    """
    fname = f'{code}.html'
    with open(fname, encoding='utf-8') as f:
        html = f.read()
    # Capture the inner HTML of every <tr ...>...</tr> row. The tag
    # delimiters are required: a bare '(.+?)' matches single characters.
    p = re.compile('<tr.*?>(.+?)</tr>', re.DOTALL)
    # Drop rows that contain only whitespace.
    trs = [tr for tr in p.findall(html) if tr.strip() != '']
    # The first remaining row is skipped as the table header.
    data_all = [get_data(tr) for tr in trs[1:]]
    df = pd.DataFrame({
        'code': [d[0] for d in data_all],
        'name': [d[1] for d in data_all],
        'href': [d[2] for d in data_all],
        'title': [d[3] for d in data_all],
        'date': [d[4] for d in data_all],
    })
    if save:
        df.to_csv(f'{fname[0:-5]}.CSV')
    return(df)


filter_url

 
'''筛选过滤掉一些不必要的公告链接'''

import datetime
def filter_words(words,df,include=True):
    """Select rows of ``df`` by which of ``words`` occur in the ``title`` column.

    Args:
        words: Substrings to look for in each title.
        df: DataFrame with a ``title`` column of strings.
        include: When true, keep rows whose title contains at least one word;
            when false, keep rows whose title contains none of them.

    Returns:
        The selected rows, obtained by indexing ``df`` with a boolean list.
    """
    titles = df['title']
    if include:
        keep = [any(word in title for word in words) for title in titles]
    else:
        keep = [all(word not in title for word in words) for title in titles]
    return(df[keep])

def filter_date(start,end,df):
    """Keep the rows of ``df`` whose ``date`` lies in ``[start, end]``.

    Both bounds are inclusive; values are compared with the ordinary
    comparison operators.
    """
    in_window = [start <= day <= end for day in df['date']]
    return(df[in_window])

def start_end_10y():
    """Return ``(start, end)`` date strings for the ten-year window ending this year.

    ``start`` is January 1st of the year nine years before the current one and
    ``end`` is December 31st of the current year, both as ``YYYY-MM-DD``.
    """
    this_year = datetime.datetime.now().year
    return(f'{this_year-9}-01-01', f'{this_year}-12-31')

def filter_nb_10y(df,keep_words=['年报','年度报告'],exclude_words=['摘要'],start=''):
    """Filter an announcement table down to ten years of annual reports.

    Args:
        df: Announcement table with ``title`` and ``date`` columns.
        keep_words: A row is kept only if its title contains one of these.
        exclude_words: A row is dropped if its title contains any of these.
        start: First date of the window as ``YYYY-...``; the window then ends
            on December 31st nine years after that year. An empty string
            selects the window returned by ``start_end_10y``.

    Returns:
        The rows passing the keep, exclude and date filters, in that order.
    """
    if start != '':
        first_year = int(start[0:4])
        end = f'{first_year + 9}-12-31'
    else:
        start, end = start_end_10y()
    # Apply the three filters one after another.
    selected = filter_words(keep_words, df, include=True)
    selected = filter_words(exclude_words, selected, include=False)
    return(filter_date(start, end, selected))

def prepare_hrefs_years(df):
    """Return the download links and one year per row of ``df``.

    Each year is the first four characters of the row's ``date`` converted to
    an int, minus one.

    Returns:
        ``(hrefs, years)`` as two lists of equal length.
    """
    years = []
    for published in df['date']:
        years.append(int(published[:4]) - 1)
    hrefs = df['href'].to_list()
    return(hrefs, years)


download

 
'''下载年报'''

import requests
import time
from sse import get_table_sse,get_table_sse_codes,parse_table
from filter_url import filter_words,filter_date,filter_nb_10y,prepare_hrefs_years
import pandas as pd

def download_pdf(href,code,year):
    """Download one report and save it as ``{code}_{year}.pdf`` in the cwd.

    Args:
        href: URL of the PDF.
        code: Stock code, used in the file name.
        year: Year label, used in the file name.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status. Nothing
            is written in that case, so an error page is never stored under a
            ``.pdf`` name.
        requests.Timeout: The server did not respond within the timeout.
    """
    fname = f'{code}_{year}.pdf'
    # The response is used as a context manager so the connection is released
    # even if writing the file fails.
    with requests.get(href, allow_redirects=True, timeout=60) as r:
        r.raise_for_status()
        with open(fname, 'wb') as f:
            f.write(r.content)
    
def download_pdfs(hrefs,code,years):
    """Download every link in ``hrefs`` for ``code``, pausing 30 s after each.

    ``years[i]`` is the year label used for ``hrefs[i]``.

    Returns:
        An empty tuple.
    """
    for position, link in enumerate(hrefs):
        download_pdf(link, code, years[position])
        time.sleep(30)  # pause between consecutive downloads
    return()

#导入各包、下载年报
from sse import get_table_sse,get_table_sse_codes,parse_table
from filter_url import filter_words,filter_date,filter_nb_10y,prepare_hrefs_years
import pandas as pd

# Stock codes of the ten companies whose annual reports are downloaded.
codes=['600059','600084','600132','600189','600197','600199','600238','600300','600365','600519']
for code in codes: 
    # Save the announcement table for this code to {code}.html.
    get_table_sse(code)
    # Parse the saved table; with the default save=True this also writes {code}.CSV.
    df = parse_table(code)
    # Keep annual-report titles and drop abstracts; start='' makes
    # filter_nb_10y use the window returned by start_end_10y().
    csv_final=filter_nb_10y(df,keep_words=['年报','年度报告'],exclude_words=['摘要'],start='')
    hrefs,years=prepare_hrefs_years(csv_final)
    # download_pdfs returns an empty tuple; `pdf` is not read afterwards.
    pdf=download_pdfs(hrefs,code,years)


2.2年报解析与画图

parse_ar

 
'''解析年报'''

import fitz
import pandas as pd
import re

def get_subtxt(doc,bounds=('主要会计数据和财务指标','总资产')):
    """Return the text of the page range delimited by two marker strings.

    Args:
        doc: Indexable sequence of pages, each offering ``get_text()``.
        bounds: ``(lb, ub)``. The range starts at the first page containing
            ``lb`` (page 0 if none does) and ends at the first page, from the
            start page on, containing ``ub`` (the last page if none does).

    Returns:
        The concatenated text of all pages in that inclusive range.
    """
    lb, ub = bounds
    page_count = len(doc)
    # First page mentioning the left marker; default to the first page.
    first = next((n for n in range(page_count) if lb in doc[n].get_text()), 0)
    # First page at or after `first` mentioning the right marker; default to the last page.
    last = next(
        (n for n in range(first, page_count) if ub in doc[n].get_text()),
        page_count - 1,
    )
    return(''.join(doc[n].get_text() for n in range(first, last + 1)))

def get_th_span(txt):
    """Locate the first table header that lists three consecutive years.

    A header candidate is ``<year>年 <year>年 ... <year>年`` (years 199x or
    20xx). A candidate is accepted when the three years descend by exactly
    one, e.g. 2022 / 2021 / 2020; otherwise the search resumes right after it.

    Args:
        txt: Text to search.

    Returns:
        ``(start, end)`` of the accepted header, as offsets into ``txt``.

    Raises:
        ValueError: ``txt`` contains no such header.
    """
    nianfen = r'(20\d\d|199\d)\s*?年'
    # re.DOTALL lets '.' cross line breaks between the second and third year.
    p = re.compile(rf'{nianfen}\s*{nianfen}.*?{nianfen}', re.DOTALL)
    pos = 0
    while True:
        # Searching with an absolute start position keeps every offset
        # relative to `txt`, including after a rejected candidate.
        matchobj = p.search(txt, pos)
        if matchobj is None:
            raise ValueError('no header with three consecutive years found')
        year1, year2, year3 = (int(y) for y in matchobj.groups())
        if year1 - year2 == 1 and year2 - year3 == 1:
            return(matchobj.span())
        pos = matchobj.end()

def get_bounds(txt):
    """Return the offsets of the text between the first two year headers.

    The range starts where the first header found by ``get_th_span`` ends and
    stops at the last ASCII digit that precedes the second header.

    Returns:
        ``(s, e)`` where ``s`` is the end offset of the first header and ``e``
        the index of that last digit.
    """
    first_end = get_th_span(txt)[1]
    # The second header is searched in the remainder, so shift its start back
    # into `txt` coordinates.
    second_start = first_end + get_th_span(txt[first_end:])[0]
    last = second_start - 1
    # Walk backwards until the character is a digit.
    while txt[last] not in '0123456789':
        last -= 1
    return(first_end, last)


def get_keywords(txt):
    """List the account names found in ``txt``, preceded by '营业收入'.

    An account name is a run of characters in U+2E80..U+9FFF that follows
    digits and whitespace. '营业收入' is always placed first.
    """
    found = re.findall(r'\d+\s+([\u2E80-\u9FFF]+)', txt)
    return(['营业收入'] + found)

def parse_key_fin_data(subtxt,keywords):
    """Split a key-financials text block into one row per account.

    Each keyword is located in order; the text from one keyword up to the
    next (or to the end of ``subtxt``) forms one segment. The leading
    non-digit part of a segment is the account name (whitespace removed) and
    the whitespace-separated tokens after it are the amounts.

    Args:
        subtxt: Text of the table body.
        keywords: Account names in the order they appear in ``subtxt``.
            NOTE(review): a keyword that is absent makes ``str.find`` return
            -1 and the segment offsets meaningless — confirm callers only pass
            names taken from the same text.

    Returns:
        A list of ``[account_name, amount, amount, ...]`` lists of strings.
    """
    # Start offset of every keyword, plus the end of the text as a sentinel.
    ss = []
    s = 0
    for kw in keywords:
        n = subtxt.find(kw, s)
        ss.append(n)
        s = n + len(kw)
    ss.append(len(subtxt))
    # Account name: non-digit text, optionally continued on a following line,
    # optionally followed by a parenthesised note (full-width or ASCII). The
    # note alternative must be delimited by literal parentheses: a bare
    # capturing '(.*)' here swallows the first amount into the name.
    p = re.compile(r'\D+(?:\s+\D*)?(?:（.*）|\(.*\))?')
    p2 = re.compile(r'\s')
    data = []
    for s, e in zip(ss, ss[1:]):
        line = subtxt[s:e]
        # Account name, possibly wrapped across lines.
        matchobj = p.search(line)
        account_name = p2.sub('', matchobj.group())
        # Everything after the name is the list of amounts.
        amnts = line[matchobj.end():].split()
        data.append([account_name] + amnts)
    return data

def get_account_data(account,txt):
    """Return the first amount that follows ``account`` in ``txt``.

    Args:
        account: Account name, inserted into the pattern as-is, so it may
            itself be a regular-expression fragment.
        txt: Text to search.

    Returns:
        The amount as it appears in the text, e.g. ``'1,234,567.89'`` or
        ``'-12,345.60'``.

    Raises:
        ValueError: ``account`` followed by an amount does not occur in ``txt``.
    """
    # account, then as few non-digits as possible, then the amount:
    # an optional minus sign (so a loss is not read as a profit), 1-3 digits,
    # any number of ',ddd' thousands groups and an optional decimal part.
    p_txt = r'%s\D*?(-?\d{1,3}(?:,\d{3})*(?:\.\d+)?)' % account
    matchobj = re.search(p_txt, txt)
    if matchobj is None:
        raise ValueError('no amount found for account pattern %r' % (account,))
    return(matchobj.group(1))


import os

# Stock codes of the ten companies whose downloaded reports are parsed.
codes=[600059,600084,600132,600189,600197,600199,600238,600300,600365,600519]
# Captures a four-digit number followed by a space that appears before '年度报告'.
p_year=re.compile(r'.*?(\d{4}) .*?年度报告.*?')
for code in codes:
    report_dir=f'D:/桌面/python/金融数据获取/nianbao/src/大作业/{code}'
    # The folder is scanned only when this file runs as a script; when it is
    # merely imported the list stays empty and no PDF is opened.
    fname=[]
    if __name__ == '__main__':
        fname=[file for file in os.listdir(report_dir) if file.split('.')[-1]=='pdf']

    df=pd.DataFrame(index=range(2013,2023),
                    columns=['营业收入(元)','归属于上市公司股东的净利润(元)'])
    # Keep the per-company frame reachable under the module-level name df_<code>.
    globals()[f'df_{code}']=df
    for f in fname:
        # Close each document once its text has been read.
        with fitz.open(f'{report_dir}/{f}') as doc:
            txt=get_subtxt(doc)
            # Text of the first 20 pages (fewer if the document is shorter),
            # used to read the report year.
            text=''.join(doc[i].get_text() for i in range(min(20,len(doc))))
        revenue=get_account_data('营业收入',txt)
        # r'\s*' between the characters tolerates a name wrapped across lines.
        profit=get_account_data(r'\s*'.join('归属于上市公司股东的净利润'),txt)
        year=int(p_year.findall(text)[0])

        df.loc[year,'营业收入(元)']=revenue
        df.loc[year,'归属于上市公司股东的净利润(元)']=profit
        # Rewritten after every report so already parsed years are on disk.
        df.to_csv(f'D:/桌面/python/金融数据获取/nianbao/src/大作业/营业收入与净利润数据/{code}.csv')


draw

 
'''画图'''

import matplotlib.pyplot as plt
from pylab import mpl
import pandas as pd

#解决中文和负号显示
mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False  

# Stock codes of the ten companies whose revenue/profit CSV files are loaded.
codes=[600059,600084,600132,600189,600197,600199,600238,600300,600365,600519]
for code in codes:
    # Bind each company's table to a name df_<code>; at module level locals()
    # is the module namespace, so the plotting statements can use the names directly.
    locals()[f'df_{code}']=pd.read_csv(f'D:/桌面/python/金融数据获取/nianbao/src/大作业/营业收入与净利润数据/{code}.csv',
                                       sep=',',encoding="utf-8")
    # Rename the three CSV columns and use the first one as the index.
    locals()[f'df_{code}'].columns =['时间','营业收入','归属于上市公司股东的净利润']
    locals()[f'df_{code}'].set_index('时间',inplace=True) 
    
    # Strip the thousands separators and convert the strings to float.
    # NOTE(review): .str requires the columns to have been read as strings — confirm for every CSV.
    locals()[f'df_{code}']['营业收入'] = locals()[f'df_{code}']['营业收入'].str.replace(',', '').astype(float)
    locals()[f'df_{code}']['归属于上市公司股东的净利润'] = locals()[f'df_{code}']['归属于上市公司股东的净利润'].str.replace(',', '').astype(float)

# Time-series chart of revenue for company 600059; the values are divided by
# 100000000 to match the unit shown in the y-axis label.
plt.figure(figsize=(9,6))
plt.plot(df_600059['营业收入']/100000000, color='b',marker='*',markersize=10) 
plt.xlabel(u'时间',fontsize=15)
plt.ylabel(u'营业收入(亿元)',fontsize=15)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.title(u'古越龙山近十年营业收入时间趋势变化图',fontsize=15)
plt.grid()
# Save before show().
plt.savefig("D:/桌面/python/金融数据获取/nianbao/src/大作业/P1")
plt.show()

# Time-series chart of net profit attributable to shareholders for company
# 600059; the values are divided by 100000000 to match the y-axis unit.
plt.figure(figsize=(9,6))
plt.plot(df_600059['归属于上市公司股东的净利润']/100000000, color='r',marker='^',markersize=10) 
# pyplot has no `xl4bel`; the x-axis label is set with xlabel.
plt.xlabel(u'时间',fontsize=15)
plt.ylabel(u'归属于上市公司股东的净利润(亿元)',fontsize=15)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.title(u'古越龙山近十年归属于上市公司股东的净利润时间趋势变化图',fontsize=15)
plt.grid()
# Save before show().
plt.savefig("D:/桌面/python/金融数据获取/nianbao/src/大作业/p2")
plt.show()

# Time-series chart of revenue for all ten companies on one figure.
plt.figure(figsize=(15,10))
# One entry per line, in drawing order: (table, line colour, legend label, marker).
revenue_lines = [
    (df_600059, '#FF6347', '古越龙山', '*'),
    (df_600084, '#00FFFF', '*ST中葡', '^'),
    (df_600132, '#8A2BE2', '重庆啤酒', 'p'),
    (df_600189, '#48D1CC', '泉阳泉', 'x'),
    (df_600197, '#FFA500', '伊力特', 'o'),
    (df_600199, '#FFC0CB', '金种子酒', 's'),
    (df_600238, '#FF00FF', '海南椰岛', 'H'),
    (df_600300, '#00FF00', 'ST维维', 'h'),
    (df_600365, '#FFD700', 'ST通葡', '*'),
    (df_600519, '#1E90FF', '贵州茅台', 'd'),
]
for table, line_colour, company, marker_shape in revenue_lines:
    # Values are divided by 100000000 to match the y-axis unit.
    plt.plot(table['营业收入']/100000000, color=line_colour, label=company,
             marker=marker_shape, markersize=10)

plt.xlabel('时间', fontsize=17)
plt.ylabel('营业收入(亿元)', fontsize=17)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title('10家公司近十年营业收入时间趋势变化图', fontsize=22)
plt.grid()
plt.legend(fontsize=15)
plt.savefig("D:/桌面/python/金融数据获取/nianbao/src/大作业/p3")
plt.show()


2.3公司及董秘基本信息获取

get information

 
import re
import fitz
import pandas as pd
from parse_ar import get_subtxt,get_account_data

def get_com_ifm(txt,keywords=['公司办公地址','公司网址','电子信箱']):
    """Read company details from the '基本情况简介' section of ``txt``.

    The section runs from '基本情况简介' up to the next '信息披露及备置地点'.
    For each keyword, the value is the line that follows the keyword's own
    line; one trailing space is removed. A keyword without such a value
    yields '无'.

    Returns:
        A list of ``[keyword, value]`` pairs in keyword order.
    """
    begin = txt.find('基本情况简介')
    section = txt[begin:txt.find('信息披露及备置地点', begin)]
    pairs = []
    for kw in keywords:
        hit = re.search('%s\\s*\n\\s*(.+)' % kw, section)
        if hit is None:
            value = '无'
        else:
            value = hit.group(1)
            if value.endswith(' '):
                value = value[:-1]
        pairs.append([kw, value])
    return pairs

def get_per_ifm(txt,keywords=['姓名','电话','电子信箱']):
    """Read contact details from the '联系人和联系方式' section of ``txt``.

    The section runs from '联系人和联系方式' up to the next '基本情况简介'.
    For each keyword the value is first looked for on the line after the
    keyword's own line; failing that, on the same line right after the
    keyword. One trailing space is removed. A keyword with no value yields
    '无'.

    Returns:
        A list of ``[keyword, value]`` pairs in keyword order.
    """
    begin = txt.find('联系人和联系方式')
    section = txt[begin:txt.find('基本情况简介', begin)]
    # Tried in order: value on the following line, then value on the same line.
    templates = ('%s\\s*\n\\s*(.+)', '%s\\s*(.+)')
    pairs = []
    for kw in keywords:
        value = '无'
        for template in templates:
            hit = re.search(template % kw, section)
            if hit is not None:
                value = hit.group(1)
                if value.endswith(' '):
                    value = value[:-1]
                break
        pairs.append([kw, value])
    return pairs

# Stock codes, report years and company short names (same order as the codes).
codes=[600059,600084,600132,600189,600197,600199,600238,600300,600365,600519]
years=list(range(2013,2023))
col_name=['古越龙山','*ST中葡','重庆啤酒','泉阳泉','伊力特','金种子酒','海南椰岛','ST维维','ST通葡','贵州茅台']
# Empty year-by-company tables for revenue and shareholder profit.
revenues=pd.DataFrame(index=years,columns=col_name)
profits_shlder=pd.DataFrame(index=years,columns=col_name)

# Collect the company and board-secretary details from each 2022 report.
bsc=pd.DataFrame()
for code in codes:
    filename=f'D:/桌面/python/金融数据获取/nianbao/src/大作业/{code}/{code}_2022.pdf'
    # Close the document once the relevant pages have been read.
    with fitz.open(filename) as doc:
        txt=get_subtxt(doc,bounds=('联系人和联系方式','信息披露及备置地点'))
    # Company details.
    data1=get_com_ifm(txt)
    bsc.loc[code,'公司办公地址']=data1[0][1]
    bsc.loc[code,'公司网址']=data1[1][1]
    bsc.loc[code,'电子信箱']=data1[2][1]
    # Board-secretary details.
    data2=get_per_ifm(txt)
    bsc.loc[code,'董事会秘书姓名']=data2[0][1]
    bsc.loc[code,'董事会秘书电话']=data2[1][1]
    bsc.loc[code,'董事会秘书电子信箱']=data2[2][1]

# These steps run once, after every company has been added: inserting the
# '公司简称' column inside the loop fails on the second company because the
# column already exists.
bsc = bsc.rename_axis("公司代码")
# Short names in code order, padded with None and cut to the number of rows.
bsc.insert(0, '公司简称',(col_name+[None]*len(bsc))[:len(bsc)])
bsc.to_csv('D:/桌面/python/金融数据获取/nianbao/src/大作业/公司及董事会秘书基本信息.csv')