import requests
import re
import pandas as pd
import os
import fitz
import time
import pdfplumber
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt
xlsx = '中药行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
links_1 = links[1:-1]
links_2 = ''.join(links_1)
sample = '=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=3E3A94A9DFF0&id=125171528&type=1","吉药控股:2019年年度报告(更新后)")'
p = re.compile('"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({'link':[t[0] for t in list_of_tuple], 'f_name':[t[1] for t in list_of_tuple]})
df2.to_csv('中药行业.csv')
df = pd.read_csv('中药行业.csv')
#标准化年报文件名
p = re.compile('(?<=\d{4}(年度))')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names
def filter_links(words,df,include=True):
ls = []
for word in words:
if include:
ls.append([word in f for f in df.f_name])
else:
ls.append([word not in f for f in df.f_name])
idnex = []
for r in range(len(df)):
flag = not include
for c in range(len(words)):
if include:
ls.append([word not in f for f in df.f_name])
index=[]
for r in range(len(df)):
flag=not include
for c in range(len(words)):
if include:
flag = flag or ls[c][r]
else:
flag = flag and ls[c][r]
index.append(flag)
df2=df[index]
return(df2)
df_all = filter_links(['摘要','问询函','审计','财务','风险'],df,include=[False])
df_orig = filter_links(['(','('],df_all,include=[False])
df_updt = filter_links(['(','(',],df_all,include=[True])
df_updt = filter_links(['取消'], df_updt,include=[False])
def sub_with_update(df_updt,df_orig):
df_newest = df_orig.copy()
index_orig=[]
index_updt=[]
for i,f in enumerate(df_orig.f_name):
for j,fn in enumerate(df_updt.f_name):
if f in fn:
index_orig.append(i)
index_updt.append(j)
#return ((index_orig,index_updt))
for n in range(len(index_orig)):
i = idnex_orig[n]
j = index_updt[n]
df_orig.iloc[i,-2] = df_updt.iloc[j,-2]
#df_newest.iloc[i,-1] = df_updt.i;oc[j,-1]
return(df_newest)
df_newest = sub_with_update(df_updt,df_orig)
# index_orig,index_updt = sub_with_update(df_updt, df_orig)
df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
ten_company = []
for cn in counts.index[:10]:
ten_company.append(filter_links([cn],df_newest))
if not os.path.exists('10companies'):
os.makedirs('10companies')
for df_com in ten_company:
cn=df_com['公司简称'].iloc[0]
df_com.to_csv('10companies/%s.csv' % cn)
ten_csv=os.listdir('10companies')
os.chdir('D:/Anaconda/10companies')
f_1=os.listdir()
f_1.remove(f_1[0])
f_1.remove(f_1[4])
links= []
f_names=[]
links = df['link']; f_names = df['f_name']
for f_2 in f_1:
f_3 = pd.read_csv(f_2)
for link in f_3['link']:
links.append(link)
for f_name in f_3['f_name']:
f_names.append(f_name)
def get_PDF_url(url):
r = requests.get(url);r.encoding = 'utf-8'; html = r.text
r.close() # 已获取html内容,结束connection
p = re.compile('<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
a = p.search(html) # 因第一个<a>即是目标标签,故用search
if a is None:
Warning('没有找到下载链接。请手动检查链接:%s' % url)
return()
else:
href = a.group(1); fname = a.group(2).strip()
href = r.url[:26] + href # 形成完整的链接
return((href,fname))
hrefs=[];fnames=[]
for link in links:
href,fname = get_PDF_url(link)
hrefs.append(href)
fnames.append(fname)
df_final_links=pd.DataFrame({'href':hrefs,'fname':fnames})
df_final_links.to_csv('中药links.csv')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-24bdac9a51d6> in <module> ----> 1 os.chdir('D:/Anaconda/10companies') 2 f_1=os.listdir() 3 f_1.remove(f_1[0]) 4 f_1.remove(f_1[4]) 5 NameError: name 'os' is not defined
df_final_links=pd.read_csv('D:/Anaconda/10companies/中药links.csv')
f_names=df_final_links['fname']
hrefs=df_final_links['href']
for i in range(len(hrefs)):
href=hrefs[i];f_name=f_names[i]
r = requests.get(href, allow_redirects=True)
open('%s' %f_name, 'wb').write(r.content)
time.sleep(10)
r.close()
path = 'D:/Anaconda/10companies/600129太极集团2020年年度报告摘要.pdf'
pdf = pdfplumber.open(path)
for page in pdf.pages:
for table in page.extract_tables():
# print(table)
for row in table:
print(row)
print('---------- 分割线 ----------')
pdf.close()
file_dir=r'D:/Anaconda/10companies'
file_list=[]
for files in os.walk(file_dir):
for file in files[2]:
if os.path.splitext(file)[1]=='.pdf' or os.path.splitext(file)[1]=='.PDF':
file_list.append(file_dir+'\\'+file)
for i in range(len(file_list)):
pdf=pdfplumber.open(file_list[i])
for page in pdf.pages:
# print(page.extract_text())
for pdf_table in page.extract_tables():
table = []
cells = []
for row in pdf_table:
if not any(row):
# 如果一行全为空,则视为一条记录结束
if any(cells):
table.append(cells)
cells = []
elif all(row):
# 如果一行全不为空,则本条为新行,上一条结束
if any(cells):
table.append(cells)
cells = []
table.append(row)
else:
if len(cells) == 0:
cells = row
else:
for i in range(len(row)):
if row[i] is not None:
cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
for row in table:
print([re.sub('\s+', '', cell) if cell is not None else None for cell in row])
print('---------- 分割线 ----------')
pdf.close()
data=pd.read_excel('D:/Anaconda/10companies/data/data.xlsx')
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文(windows)
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
fig = plt.figure(figsize=(15,9), dpi=100)
ax = fig.add_subplot(111)
%matplotlib inline
data=pd.read_excel('D:/Anaconda/10companies/data/data.xlsx')
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文(windows)
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
fig = plt.figure(figsize=(15,9), dpi=100)
ax = fig.add_subplot(111)
ax.plot(data['date'],data['康美'],color='r', label='康美药业')
ax.plot(data['date'],data['辅仁'],color='y', label='辅仁药业')
ax.plot(data['date'],data['天目'],color='green', label='天目药业')
ax.plot(data['date'],data['羚锐'],color='black', label='羚锐药业')
ax.plot(data['date'],data['片仔癀'],color='grey', label='片仔癀药业')
ax.plot(data['date'],data['神奇制药'],color='pink', label='神奇制药')
ax.plot(data['date'],data['太极'],color='Navy', label='太极药业')
ax.plot(data['date'],data['太龙'],color='Gold', label='太龙药业')
ax.plot(data['date'],data['天士力'],color='Orange', label='天士力药业')
ax.plot(data['date'],data['亚宝'],color='Maroon', label='亚宝药业')
plt.xlabel('年份', fontsize=14) # X轴标签
plt.ylabel("百万", fontsize=16) # Y轴标签
ax.legend() # 图例
plt.title("营业收入", fontsize=25, color='black', pad=20)
plt.gcf().autofmt_xdate()
plt.show()
data=pd.read_excel('D:/Anaconda/10companies/data/data.xlsx').set_index(['date'])
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['康美药业','辅仁药业','天目药业','羚锐药业','片仔癀药业','神奇制药', '太极药业', '太龙药业', '天士力药业', '亚宝药业']
data0=data.iloc[8]
plt.barh(range(len(data0)), data0, tick_label=companies_name, color='#6699CC')
plt.title('2014年营业收入对比(单位:百万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['康美药业','辅仁药业','天目药业','羚锐药业','片仔癀药业','神奇制药', '太极药业', '太龙药业', '天士力药业', '亚宝药业']
data1=data.iloc[9]
plt.barh(range(len(data1)), data1, tick_label=companies_name, color='#6699CC')
plt.title('2013年营业收入对比(单位:百万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['康美药业','辅仁药业','天目药业','羚锐药业','片仔癀药业','神奇制药', '太极药业', '太龙药业', '天士力药业', '亚宝药业']
data2=data.iloc[7]
plt.barh(range(len(data2)), data2, tick_label=companies_name, color='#6699CC')
plt.title('2015年营业收入对比(单位:百万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['康美药业','辅仁药业','天目药业','羚锐药业','片仔癀药业','神奇制药', '太极药业', '太龙药业', '天士力药业', '亚宝药业']
data3=data.iloc[6]
plt.barh(range(len(data3)), data3 , tick_label=companies_name, color='#6699CC')
plt.title('2016年营业收入对比(单位:百万元)')
plt.show()
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
companies_name = ['康美药业','辅仁药业','天目药业','羚锐药业','片仔癀药业','神奇制药', '太极药业', '太龙药业', '天士力药业', '亚宝药业']
data4=data.iloc[5]
plt.barh(range(len(data4)), data4, tick_label=companies_name, color='#6699CC')
plt.title('2017年营业收入对比(单位:百万元)')
plt.show()