姓名 丰亦成 学号 4201469
import re
import pandas as pd
import os
import openpyxl
import requests
import time
# Step 1: pull the (link, title) pairs out of column C of the workbook and
# save them as a CSV for the later steps.
# Raw strings keep the Windows backslashes literal; the non-raw form relied on
# invalid escape sequences (e.g. "\p"), which Python 3.12+ warns about.
os.chdir(r'F:\pythonshuju')
xlsx = r'F:\pythonshuju\新能源行业.xlsx'
df = pd.read_excel(xlsx)  # read the worksheet contents into a DataFrame
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active  # the workbook's active worksheet
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
# Drop the first cell (header) and the last cell of the column.
# NOTE(review): presumably the last cell is a footer rather than a report
# link - confirm against the workbook.
links_1 = links[1:-1]
# Empty cells come back as None and would make str.join raise a TypeError,
# so only text cells are concatenated.
links_2 = ''.join(value for value in links_1 if isinstance(value, str))
# Every match is a quoted pair: "<link>","<title>".
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({'link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})
print(df2)
df2.to_csv('新能源行业.csv')
link f_name 0 http://news.windin.com/ns/bulletin.php?code=81... 孚能科技:2020年年度报告全文(修订版) 1 http://news.windin.com/ns/bulletin.php?code=4E... 亿华通:2020年年度报告(修订版) 2 http://news.windin.com/ns/bulletin.php?code=5F... 鸿达兴业:2020年年度报告(更新后) 3 http://news.windin.com/ns/bulletin.php?code=60... 鸿达兴业:2020年年度报告摘要(更新后) 4 http://news.windin.com/ns/bulletin.php?code=F1... 孚能科技:2020年年度报告全文(修订版) .. ... ... 182 http://news.windin.com/ns/bulletin.php?code=9B... 京城股份:2019年年度报告 183 http://news.windin.com/ns/bulletin.php?code=95... 京城股份:2019年年度报告摘要 184 http://news.windin.com/ns/bulletin.php?code=BB... 贝斯特:2019年年度报告 185 http://news.windin.com/ns/bulletin.php?code=B1... 贝斯特:2019年年度报告摘要 186 http://news.windin.com/ns/bulletin.php?code=1C... 大洋电机:2018年年度报告(更新后) [187 rows x 2 columns]
import re
import pandas as pd
import os
# Step 2: reload the link table and normalise the report titles so that every
# annual report is spelled "年年度报告".
df = pd.read_csv("新能源行业.csv", engine="python", encoding="utf-8")
# Raw string so that \d reaches the regex engine instead of being an invalid
# Python string escape.
# The alternation is not parenthesised, so the 4-digit look-behind guards only
# the first alternative ("年报"); "年年报" is replaced wherever it occurs.
p = re.compile(r"(?<=\d{4})(年报)|(年年报)")
f_names = [p.sub("年年度报告", f) for f in df.f_name]
df["f_name"] = f_names
del p, f_names
print(df)
Unnamed: 0 link \ 0 0 http://news.windin.com/ns/bulletin.php?code=81... 1 1 http://news.windin.com/ns/bulletin.php?code=4E... 2 2 http://news.windin.com/ns/bulletin.php?code=5F... 3 3 http://news.windin.com/ns/bulletin.php?code=60... 4 4 http://news.windin.com/ns/bulletin.php?code=F1... .. ... ... 182 182 http://news.windin.com/ns/bulletin.php?code=9B... 183 183 http://news.windin.com/ns/bulletin.php?code=95... 184 184 http://news.windin.com/ns/bulletin.php?code=BB... 185 185 http://news.windin.com/ns/bulletin.php?code=B1... 186 186 http://news.windin.com/ns/bulletin.php?code=1C... f_name 0 孚能科技:2020年年度报告全文(修订版) 1 亿华通:2020年年度报告(修订版) 2 鸿达兴业:2020年年度报告(更新后) 3 鸿达兴业:2020年年度报告摘要(更新后) 4 孚能科技:2020年年度报告全文(修订版) .. ... 182 京城股份:2019年年度报告 183 京城股份:2019年年度报告摘要 184 贝斯特:2019年年度报告 185 贝斯特:2019年年度报告摘要 186 大洋电机:2018年年度报告(更新后) [187 rows x 3 columns]
def filter_links(words, df, include=True):
    """Select the rows of *df* whose ``f_name`` does (or does not) mention *words*.

    Args:
        words: Sequence of substrings to look for in each ``f_name`` value.
        df: DataFrame with an ``f_name`` column of strings.
        include: When True, keep rows whose ``f_name`` contains at least one
            of *words*. When False, keep rows whose ``f_name`` contains none
            of them.

    Returns:
        A DataFrame holding the selected rows, with the columns and index
        labels of *df*. With an empty *words*, ``include=True`` selects no
        rows and ``include=False`` selects every row.
    """
    words = tuple(words)
    if include:
        mask = [any(word in name for word in words) for name in df.f_name]
    else:
        mask = [all(word not in name for word in words) for name in df.f_name]
    # Index with a boolean Series rather than a plain list: an empty list
    # (empty input frame) would be read as "select no columns" and drop them.
    return df[pd.Series(mask, index=df.index, dtype=bool)]
# Step 3: split the announcements into original reports and revised reports.
# First drop everything that is not the annual report itself (summaries,
# inquiry letters, CSR/audit/financial/risk/bond documents).
df_all = filter_links(["摘要", "问询函", "社会责任", "审计", "财务", "风险", "债券"],
                      df, include=False)
# A bracketed suffix marks a revised/updated version of a report. The list
# previously held the same bracket twice; it now covers both the ASCII and the
# full-width opening bracket so either spelling in the titles is recognised.
bracket_marks = ["(", "("]
df_orig = filter_links(bracket_marks, df_all, include=False)
df_updt = filter_links(bracket_marks, df_all, include=True)
# Announcements containing "取消" are excluded from the updated set.
df_updt = filter_links(["取消"], df_updt, include=False)
print(df_orig)
Unnamed: 0 link \ 6 6 http://news.windin.com/ns/bulletin.php?code=49... 9 9 http://news.windin.com/ns/bulletin.php?code=CB... 11 11 http://news.windin.com/ns/bulletin.php?code=6B... 12 12 http://news.windin.com/ns/bulletin.php?code=C6... 14 14 http://news.windin.com/ns/bulletin.php?code=0A... .. ... ... 175 175 http://news.windin.com/ns/bulletin.php?code=97... 177 177 http://news.windin.com/ns/bulletin.php?code=1C... 179 179 http://news.windin.com/ns/bulletin.php?code=35... 182 182 http://news.windin.com/ns/bulletin.php?code=9B... 184 184 http://news.windin.com/ns/bulletin.php?code=BB... f_name 6 方正电机:2020年年度报告 9 孚能科技:2020年年度报告全文 11 卧龙电驱:2020年年度报告 12 汉钟精机:2020年年度报告 14 亿华通:2020年年度报告全文 .. ... 175 安泰科技:2019年年度报告 177 科达利:2019年年度报告 179 英搏尔:2019年年度报告 182 京城股份:2019年年度报告 184 贝斯特:2019年年度报告 [86 rows x 3 columns]
def sub_with_update(df_updt, df_orig):
    """Return a copy of *df_orig* whose links point at the updated reports.

    For every row of *df_orig* whose ``f_name`` is a substring of some
    ``f_name`` in *df_updt*, the value in the second-to-last column is
    replaced by the value from the second-to-last column of that *df_updt*
    row. When several updated rows match, the last one in *df_updt* wins.

    Args:
        df_updt: DataFrame of updated reports, with an ``f_name`` column.
        df_orig: DataFrame of original reports, with an ``f_name`` column.

    Returns:
        A new DataFrame; *df_orig* itself is left unmodified.
    """
    df_newest = df_orig.copy()
    updated_names = list(df_updt.f_name)
    for i, name in enumerate(df_orig.f_name):
        for j, updated_name in enumerate(updated_names):
            if name in updated_name:
                # Positional column -2 is kept as-is.
                # NOTE(review): presumably this is the "link" column - it is
                # for frames laid out as (..., link, f_name); confirm.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
# Step 4: prefer the updated link where one exists, sort by title and derive
# each company's short name.
df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=["f_name"], inplace=True, ignore_index=True)
# The short name is the text before the first colon (ASCII or full-width).
# A fixed 4-character slice cut off names such as "*ST京城" and dragged the
# colon into 3-character names; it is kept only as a fallback for titles
# without a colon.
short_names = []
for title in df_newest.f_name:
    head = re.match(r"(.+?)[::]", title)
    short_names.append(head.group(1) if head else title[:4])
df_newest["公司简称"] = short_names
print(df_newest)
Unnamed: 0 link \ 0 87 http://news.windin.com/ns/bulletin.php?code=47... 1 85 http://news.windin.com/ns/bulletin.php?code=15... 2 43 http://news.windin.com/ns/bulletin.php?code=8F... 3 143 http://news.windin.com/ns/bulletin.php?code=FD... 4 50 http://news.windin.com/ns/bulletin.php?code=D3... .. ... ... 81 40 http://news.windin.com/ns/bulletin.php?code=1D... 82 130 http://news.windin.com/ns/bulletin.php?code=AF... 83 68 http://news.windin.com/ns/bulletin.php?code=5F... 84 159 http://news.windin.com/ns/bulletin.php?code=6D... 85 16 http://news.windin.com/ns/bulletin.php?code=E6... f_name 公司简称 0 *ST京城:2020年年度报告 *ST京 1 *ST江特:2020年年度报告 *ST江 2 ST电能:2020年年度报告 ST电能 3 中泰股份:2019年年度报告 中泰股份 4 中泰股份:2020年年度报告 中泰股份 .. ... ... 81 雪人股份:2020年年度报告 雪人股份 82 鸿达兴业:2019年年度报告 鸿达兴业 83 鸿达兴业:2020年年度报告 鸿达兴业 84 鹏辉能源:2019年年度报告 鹏辉能源 85 鹏辉能源:2020年年度报告 鹏辉能源 [86 rows x 4 columns]
# Step 5: keep the ten companies with the most reports, write one CSV per
# company, then collect every link and title from those CSVs.
# value_counts() orders by frequency, so the first ten labels are the
# companies with the most rows.
counts = df_newest["公司简称"].value_counts()
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]
os.makedirs("10companies", exist_ok=True)
f_1 = []  # names of the CSV files written for the selected companies
for df_com in ten_company:
    cn = df_com["公司简称"].iloc[0]
    # Characters such as '*' or ':' are not valid in Windows file names.
    csv_name = re.sub(r'[\\/:*?"<>|]', "_", cn) + ".csv"
    df_com.to_csv("10companies/" + csv_name)
    if csv_name not in f_1:
        f_1.append(csv_name)
ten_csv = os.listdir("10companies")
os.chdir(r'F:\pythonshuju\10companies')
links = []
f_names = []
# Read back only the files written above. Listing the whole directory also
# picks up unrelated CSVs saved there (they have no 'link' column).
for f_2 in f_1:
    f_3 = pd.read_csv(f_2)
    links.extend(f_3['link'])
    f_names.extend(f_3['f_name'])
def get_PDF_url(url):
    """Fetch an announcement page and extract its first download link.

    Args:
        url: Address of the announcement page.

    Returns:
        A tuple ``(href, fname)``: the download address and the link text of
        the first ``<a href=...>`` tag on the page. If the page contains no
        such tag, a warning is issued and an empty tuple is returned.
    """
    import warnings

    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    r.close()  # the HTML is already in memory, so release the connection
    # Raw string so that \s reaches the regex engine unchanged.
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)  # the first <a> tag is the target, so search() suffices
    if a is None:
        # warnings.warn() actually emits the message; a bare Warning(...)
        # only builds an exception object and discards it.
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()
    href = a.group(1)
    fname = a.group(2).strip()
    # Prefix the captured href with the first 26 characters of the final URL.
    # NOTE(review): presumably that prefix is the site's base path
    # ("http://news.windin.com/ns/") - confirm.
    href = r.url[:26] + href
    return (href, fname)
# Step 6: resolve every announcement page to its download link, save the
# list, then download the files into the current directory.
hrefs = []
fnames = []
for link in links:
    result = get_PDF_url(link)
    time.sleep(10)  # pause between requests
    if not result:
        # get_PDF_url returns an empty tuple when the page has no download
        # link; skip it instead of failing on the tuple unpacking.
        continue
    href, fname = result
    hrefs.append(href)
    fnames.append(fname)
df_final_links = pd.DataFrame({'href': hrefs, 'fname': fnames})
df_final_links.to_csv('links新能源行业.csv')
df_final_links = pd.read_csv('links新能源行业.csv')
f_names = df_final_links['fname']
hrefs = df_final_links['href']
for href, f_name in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    # The context manager closes the file even if the write fails.
    with open('%s' % f_name, 'wb') as pdf_file:
        pdf_file.write(r.content)
    r.close()
    time.sleep(10)  # pause between downloads
import matplotlib.pyplot as plt
import pandas as pd
import csv
# Step 7: plot the revenue series of one company from the revenue table.
# The first CSV row supplies the column names, the remaining rows the data.
# newline='' is the mode the csv module documents for files it reads.
# NOTE(review): the file is opened with the platform default encoding, as
# before - confirm it matches how the CSV was saved.
with open(r'F:\pythonshuju\10companies\新能源企业营收.csv', newline='') as csvFile:
    ls = list(csv.reader(csvFile))
df = pd.DataFrame(data=ls[1:], columns=ls[0])
df.index = pd.to_datetime(df.iloc[:, 0])  # first column becomes the time index
df = df.astype('float')
# "%matplotlib inline" is IPython-only syntax and a SyntaxError in a plain
# Python file; request the inline backend only when running under IPython.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
plt.plot(df.index, df['宁德时代'], lw=2)
plt.xlabel('年度')
plt.ylabel('营业收入')
plt.title('数据中新能源企业营收折线图')
plt.grid(True, axis='both')
plt.show()  # needed to display the figure when run as a script