import pandas as pd
import openpyxl
import re

xlsx = 'C:/Users/lenovo/文化娱乐行业.xlsx'

df = pd.read_excel(xlsx)

exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [c.value for c in C]

links_1 = links[1:-1]
links_2 = ''.join(links_1)

sample  = '=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=239751740736&id=126293652&type=1","万达电影:2019年年度报告(更新后)")'

p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)

df2 = pd.DataFrame({'link':[t[0] for t in list_of_tuple], 'f_name':[t[1] for t in list_of_tuple]})

df2.to_csv('文化娱乐行业.csv')

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8312/325364261.py in <module>
      5 xlsx = 'C:/Users/lenovo/文化娱乐行业.xlsx'
      6 
----> 7 df = pd.read_excel(xlsx)
      8 
      9 exf = openpyxl.load_workbook(xlsx)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
    362     if not isinstance(io, ExcelFile):
    363         should_close = True
--> 364         io = ExcelFile(io, storage_options=storage_options, engine=engine)
    365     elif engine and engine != io.engine:
    366         raise ValueError(

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options)
   1189                 ext = "xls"
   1190             else:
-> 1191                 ext = inspect_excel_format(
   1192                     content_or_path=path_or_buffer, storage_options=storage_options
   1193                 )

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\excel\_base.py in inspect_excel_format(content_or_path, storage_options)
   1068         content_or_path = BytesIO(content_or_path)
   1069 
-> 1070     with get_handle(
   1071         content_or_path, "rb", storage_options=storage_options, is_text=False
   1072     ) as handle:

C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    709         else:
    710             # Binary mode
--> 711             handle = open(handle, ioargs.mode)
    712         handles.append(handle)
    713 

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/lenovo/文化娱乐行业.xlsx'


import re 
import pandas as pd
import os
import fitz
import time
import pdfplumber

df = pd.read_csv('C:/Users/lenovo/文化娱乐行业.csv')
#标准化年报文件名
p = re.compile('(?<=\d{4}(年度))')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names

def filter_links(words,df,include=True):
    ls = []
    for word in words:
        if include:
            ls.append([word in f for f in df.f_name])
        else:
            ls.append([word not in f for f in df.f_name])
    i  = []
    for r in range(len(df)):
        flag  = not include
        for c in range(len(words)):
            if include:
                ls.append([word not in f for f in df.f_name])
                index=[]
                for r in range(len(df)):
                    flag=not include
                    for c in range(len(words)):
                        if include:
                            flag = flag or ls[c][r]
                        else:
                            flag = flag and ls[c][r]
                    index.append(flag)
                df2=df[index]
                return(df2)
                
df_all = filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df,include=[False])
df_orig = filter_links(['(','('],df_all,include=[False])
df_updt = filter_links(['(','(',],df_all,include=[True])
df_updt = filter_links(['取消'], df_updt,include=[False])


def sub_with_update(df_updt,df_orig):
    df_newest = df_orig.copy()
    index_orig=[]
    index_updt=[]
    for i,f in enumerate(df_orig.f_name):
        for j,fn in enumerate(df_updt.f_name):
            if f in fn:
             index_orig.append(i)
             index_updt.append(j)
    #return ((index_orig,index_updt))
    for n in range(len(index_orig)):
        i = index_orig[n]
        j = index_updt[n]
        df_orig.iloc[i,-2] = df_updt.iloc[j,-2]
        #df_newest.iloc[i,-1] = df_updt.i;oc[j,-1]
    return(df_newest)

df_newest  = sub_with_update(df_updt,df_orig)

df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]

counts = df_newest['公司简称'].value_counts()

ten_company = []
for cn in counts.index[:10]:
    ten_company.append(filter_links([cn],df_newest))
    
if not os.path.exists('10companies'):
    os.makedirs('10companies')
    
for df_com in ten_company:
    cn=df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
    
ten_csv=os.listdir('10companies')
    
os.chdir('C:/Users/lenovo/10companies')
f_1=os.listdir()


links= []
f_names=[]
links = df['link']; f_names = df['f_name']

def get_PDF_url(url):
    r = requests.get(url);r.encoding = 'utf-8'; html = r.text
    r.close() # 已获取html内容，结束connection
    p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html) # 因第一个<a>即是目标标签，故用search
    if a is None:
        Warning('没有找到下载链接。请手动检查链接：%s' % url)
        return()
    else:
        href = a.group(1); fname = a.group(2).strip()
    href = r.url[:26] + href # 形成完整的链接
    return((href,fname))

hrefs=[];fnames=[]
for link in links:
    href,fname = get_PDF_url(link)
    hrefs.append(href)
    fnames.append(fname)
    time.sleep(10)
    df_final_links=pd.DataFrame({'href':hrefs,'fname':fnames})
    df_final_links.to_csv('文化娱乐行业.csv')
    

file_dir=r'C:/Users/lenovo/10companies'
file_list=[]
for files in os.walk(file_dir):
    for file in files[2]:
        if os.path.splitext(file)[1]=='.pdf' or os.path.splitext(file)[1]=='.PDF':
            file_list.append(file_dir+'\\'+file)
            
for i in range(len(file_list)):
    pdf=pdfplumber.open(file_list[i])
    pages=pdf.pages
    page=pages[-3]
    tables=page.extract_tables()
    table=tables[0]
    print(table)
    
bg=pd.DataFrame(table[1:],columns=table[0])
print(bg)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8312/3219937808.py in <module>
    103 hrefs=[];fnames=[]
    104 for link in links:
--> 105     href,fname = get_PDF_url(link)
    106     hrefs.append(href)
    107     fnames.append(fname)

~\AppData\Local\Temp/ipykernel_8312/3219937808.py in get_PDF_url(url)
     89 
     90 def get_PDF_url(url):
---> 91     r = requests.get(url);r.encoding = 'utf-8'; html = r.text
     92     r.close() # 已获取html内容，结束connection
     93     p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)

NameError: name 'requests' is not defined


import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
rcParams['font.family'] = 'simhei'

df = pd.read_excel("C:/Users/lenovo/Desktop/营业收入数据表.xlsx")
print(df)

plt.plot(df["年份"],df["奥飞娱乐"],label='奥飞娱乐',linewidth=1,color='r',markersize=12)
plt.plot(df["年份"],df["万达电影"],label='万达电影',linewidth=1,color='y',markersize=12)
plt.plot(df["年份"],df["完美世界"],label='完美世界',linewidth=1,color='blue',markersize=12)
plt.plot(df["年份"],df["世纪华通"],label='世纪华通',linewidth=1,color='purple',markersize=12)
plt.plot(df["年份"],df["盛讯达"],label='盛讯达',linewidth=1,color='green',markersize=12)
plt.plot(df["年份"],df["盛天网络"],label='盛天网络',linewidth=1,color='brown',markersize=12)
plt.plot(df["年份"],df["北京文化"],label='北京文化',linewidth=1,color='black',markersize=12)
plt.plot(df["年份"],df["艾格拉斯"],label='艾格拉斯',linewidth=1,color='pink',markersize=12)
plt.plot(df["年份"],df["华谊兄弟"],label='华谊兄弟',linewidth=1,color='grey',markersize=12)
plt.plot(df["年份"],df["金科文化"],label='金科文化',linewidth=1,color='violet',markersize=12)


plt.xlabel("年份")
plt.ylabel('营业总收入')
plt.title("营业收入走势图")
plt.legend()
plt.grid()
plt.show()

       年份     奥飞娱乐      万达电影      完美世界      世纪华通       盛讯达    盛天网络     北京文化  \
0  2012年报  1291.16   3031.11    407.08    928.90   3031.11  170.43   166.11   
1  2013年报  1553.01   4022.56    990.09   1227.43   4022.56  213.13   162.86   
2  2014年报  2429.67   5338.99    925.78   1712.23   5338.99  231.71   420.69   
3  2015年报  2589.17   8000.73   4893.07   3025.83   8000.73  239.79   349.35   
4  2016年报  3360.67  11209.32   6158.83   3455.90  11209.32  344.11   926.55   
5  2017年报  3642.46  13229.38   7929.82   3490.82  13229.38  388.37  1321.00   
6  2018年报  2839.79  16287.42   8033.77  12523.52  16287.42  496.78   741.36   
7  2019年报  2726.92  15435.36   8039.02  14689.72  15435.36  660.72   855.34   
8  2020年报  2368.20   6295.48  10224.77  14982.97   6295.48  895.88   425.78   
9  2021年报  2004.45   9271.48   6739.49  10785.98   9271.48  959.74   158.81   

     艾格拉斯     华谊兄弟     金科文化  
0  308.32  1386.40   464.67  
1  505.29  2013.96   480.41  
2  375.84  2389.02   476.90  
3  489.54  3873.57   506.44  
4  562.98  3503.46   894.56  
5  845.09  3946.28  1396.27  
6  829.29  3814.47  2725.15  
7  553.67  2243.55  1943.03  
8  181.90  1500.00  1807.22  
9   44.92   955.54  1402.05


import pandas as pd
import matplotlib.pyplot as plt
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import rcParams
rcParams['font.family'] = 'simhei'

data = pd.read_excel('C:/Users/lenovo/Desktop/营业成本.xlsx').set_index(['年份'])

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[0]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2017年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[1]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2018年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name =['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[2]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2019年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[3]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2020年营业成本对比（单位：万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name =['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[4]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2021年营业成本对比（单位：万元)')
plt.show()

实验报告：文化娱乐行业分析¶

#第一步：读取文化娱乐行业的Excel，并转为CSV格式¶

#第二步：读取CSV文件，筛选符合条件的文件并下载¶

#第三步：读取相关的数据内容，并画出图表分析，首先是十个企业近十年的营业总收入对比图¶

#第四步：分析这十个公司近五年，每年的营业成本，并作图对比¶