import pandas as pd
import openpyxl
import re
xlsx = 'C:/Users/lenovo/文化娱乐行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
links_1 = links[1:-1]
links_2 = ''.join(links_1)
sample = '=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=239751740736&id=126293652&type=1","万达电影:2019年年度报告(更新后)")'
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({'link':[t[0] for t in list_of_tuple], 'f_name':[t[1] for t in list_of_tuple]})
df2.to_csv('文化娱乐行业.csv')
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_8312/325364261.py in <module> 5 xlsx = 'C:/Users/lenovo/文化娱乐行业.xlsx' 6 ----> 7 df = pd.read_excel(xlsx) 8 9 exf = openpyxl.load_workbook(xlsx) C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs) 309 stacklevel=stacklevel, 310 ) --> 311 return func(*args, **kwargs) 312 313 return wrapper C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options) 362 if not isinstance(io, ExcelFile): 363 should_close = True --> 364 io = ExcelFile(io, storage_options=storage_options, engine=engine) 365 elif engine and engine != io.engine: 366 raise ValueError( C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options) 1189 ext = "xls" 1190 else: -> 1191 ext = inspect_excel_format( 1192 content_or_path=path_or_buffer, storage_options=storage_options 1193 ) C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\excel\_base.py in inspect_excel_format(content_or_path, storage_options) 1068 content_or_path = BytesIO(content_or_path) 1069 -> 1070 with get_handle( 1071 content_or_path, "rb", storage_options=storage_options, is_text=False 1072 ) as handle: C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 709 else: 710 # Binary mode --> 711 handle = open(handle, ioargs.mode) 712 handles.append(handle) 713 FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/lenovo/文化娱乐行业.xlsx'
import re
import pandas as pd
import os
import fitz
import time
import pdfplumber
df = pd.read_csv('C:/Users/lenovo/文化娱乐行业.csv')
#标准化年报文件名
p = re.compile('(?<=\d{4}(年度))')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names
def filter_links(words,df,include=True):
ls = []
for word in words:
if include:
ls.append([word in f for f in df.f_name])
else:
ls.append([word not in f for f in df.f_name])
i = []
for r in range(len(df)):
flag = not include
for c in range(len(words)):
if include:
ls.append([word not in f for f in df.f_name])
index=[]
for r in range(len(df)):
flag=not include
for c in range(len(words)):
if include:
flag = flag or ls[c][r]
else:
flag = flag and ls[c][r]
index.append(flag)
df2=df[index]
return(df2)
df_all = filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df,include=[False])
df_orig = filter_links(['(','('],df_all,include=[False])
df_updt = filter_links(['(','(',],df_all,include=[True])
df_updt = filter_links(['取消'], df_updt,include=[False])
def sub_with_update(df_updt,df_orig):
df_newest = df_orig.copy()
index_orig=[]
index_updt=[]
for i,f in enumerate(df_orig.f_name):
for j,fn in enumerate(df_updt.f_name):
if f in fn:
index_orig.append(i)
index_updt.append(j)
#return ((index_orig,index_updt))
for n in range(len(index_orig)):
i = index_orig[n]
j = index_updt[n]
df_orig.iloc[i,-2] = df_updt.iloc[j,-2]
#df_newest.iloc[i,-1] = df_updt.i;oc[j,-1]
return(df_newest)
df_newest = sub_with_update(df_updt,df_orig)
df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
ten_company = []
for cn in counts.index[:10]:
ten_company.append(filter_links([cn],df_newest))
if not os.path.exists('10companies'):
os.makedirs('10companies')
for df_com in ten_company:
cn=df_com['公司简称'].iloc[0]
df_com.to_csv('10companies/%s.csv' % cn)
ten_csv=os.listdir('10companies')
os.chdir('C:/Users/lenovo/10companies')
f_1=os.listdir()
links= []
f_names=[]
links = df['link']; f_names = df['f_name']
def get_PDF_url(url):
r = requests.get(url);r.encoding = 'utf-8'; html = r.text
r.close() # 已获取html内容,结束connection
p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
a = p.search(html) # 因第一个<a>即是目标标签,故用search
if a is None:
Warning('没有找到下载链接。请手动检查链接:%s' % url)
return()
else:
href = a.group(1); fname = a.group(2).strip()
href = r.url[:26] + href # 形成完整的链接
return((href,fname))
hrefs=[];fnames=[]
for link in links:
href,fname = get_PDF_url(link)
hrefs.append(href)
fnames.append(fname)
time.sleep(10)
df_final_links=pd.DataFrame({'href':hrefs,'fname':fnames})
df_final_links.to_csv('文化娱乐行业.csv')
file_dir=r'C:/Users/lenovo/10companies'
file_list=[]
for files in os.walk(file_dir):
for file in files[2]:
if os.path.splitext(file)[1]=='.pdf' or os.path.splitext(file)[1]=='.PDF':
file_list.append(file_dir+'\\'+file)
for i in range(len(file_list)):
pdf=pdfplumber.open(file_list[i])
pages=pdf.pages
page=pages[-3]
tables=page.extract_tables()
table=tables[0]
print(table)
bg=pd.DataFrame(table[1:],columns=table[0])
print(bg)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return func(*args, **kwargs)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_8312/3219937808.py in <module> 103 hrefs=[];fnames=[] 104 for link in links: --> 105 href,fname = get_PDF_url(link) 106 hrefs.append(href) 107 fnames.append(fname) ~\AppData\Local\Temp/ipykernel_8312/3219937808.py in get_PDF_url(url) 89 90 def get_PDF_url(url): ---> 91 r = requests.get(url);r.encoding = 'utf-8'; html = r.text 92 r.close() # 已获取html内容,结束connection 93 p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL) NameError: name 'requests' is not defined
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
rcParams['font.family'] = 'simhei'
df = pd.read_excel("C:/Users/lenovo/Desktop/营业收入数据表.xlsx")
print(df)
plt.plot(df["年份"],df["奥飞娱乐"],label='奥飞娱乐',linewidth=1,color='r',markersize=12)
plt.plot(df["年份"],df["万达电影"],label='万达电影',linewidth=1,color='y',markersize=12)
plt.plot(df["年份"],df["完美世界"],label='完美世界',linewidth=1,color='blue',markersize=12)
plt.plot(df["年份"],df["世纪华通"],label='世纪华通',linewidth=1,color='purple',markersize=12)
plt.plot(df["年份"],df["盛讯达"],label='盛讯达',linewidth=1,color='green',markersize=12)
plt.plot(df["年份"],df["盛天网络"],label='盛天网络',linewidth=1,color='brown',markersize=12)
plt.plot(df["年份"],df["北京文化"],label='北京文化',linewidth=1,color='black',markersize=12)
plt.plot(df["年份"],df["艾格拉斯"],label='艾格拉斯',linewidth=1,color='pink',markersize=12)
plt.plot(df["年份"],df["华谊兄弟"],label='华谊兄弟',linewidth=1,color='grey',markersize=12)
plt.plot(df["年份"],df["金科文化"],label='金科文化',linewidth=1,color='violet',markersize=12)
plt.xlabel("年份")
plt.ylabel('营业总收入')
plt.title("营业收入走势图")
plt.legend()
plt.grid()
plt.show()
年份 奥飞娱乐 万达电影 完美世界 世纪华通 盛讯达 盛天网络 北京文化 \ 0 2012年报 1291.16 3031.11 407.08 928.90 3031.11 170.43 166.11 1 2013年报 1553.01 4022.56 990.09 1227.43 4022.56 213.13 162.86 2 2014年报 2429.67 5338.99 925.78 1712.23 5338.99 231.71 420.69 3 2015年报 2589.17 8000.73 4893.07 3025.83 8000.73 239.79 349.35 4 2016年报 3360.67 11209.32 6158.83 3455.90 11209.32 344.11 926.55 5 2017年报 3642.46 13229.38 7929.82 3490.82 13229.38 388.37 1321.00 6 2018年报 2839.79 16287.42 8033.77 12523.52 16287.42 496.78 741.36 7 2019年报 2726.92 15435.36 8039.02 14689.72 15435.36 660.72 855.34 8 2020年报 2368.20 6295.48 10224.77 14982.97 6295.48 895.88 425.78 9 2021年报 2004.45 9271.48 6739.49 10785.98 9271.48 959.74 158.81 艾格拉斯 华谊兄弟 金科文化 0 308.32 1386.40 464.67 1 505.29 2013.96 480.41 2 375.84 2389.02 476.90 3 489.54 3873.57 506.44 4 562.98 3503.46 894.56 5 845.09 3946.28 1396.27 6 829.29 3814.47 2725.15 7 553.67 2243.55 1943.03 8 181.90 1500.00 1807.22 9 44.92 955.54 1402.05
import pandas as pd
import matplotlib.pyplot as plt
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import rcParams
rcParams['font.family'] = 'simhei'
data = pd.read_excel('C:/Users/lenovo/Desktop/营业成本.xlsx').set_index(['年份'])
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[0]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2017年营业成本对比(单位:万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[1]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2018年营业成本对比(单位:万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name =['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[2]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2019年营业成本对比(单位:万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[3]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2020年营业成本对比(单位:万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name =['奥飞娱乐','万达电影','完美世界','世纪华通','盛讯达','盛天网络','北京文化','艾格拉斯','华谊兄弟','金科文化']
data0=data.iloc[4]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2021年营业成本对比(单位:万元)')
plt.show()