import pandas as pd
import openpyxl
import re
# Extract the announcement hyperlinks stored as Excel formulas in column C of
# the workbook and save them as a two-column (link, f_name) CSV table.
xlsx = 'D:/anaconda/10companies/航空航天行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
# Drop the first and the last cell of the column, exactly as before
# (presumably a header cell and a trailing footer cell -- confirm against the
# workbook).
links_1 = links[1:-1]
# openpyxl reports empty cells as None; joining them raised TypeError, so only
# string cells are concatenated.
links_2 = ''.join(value for value in links_1 if isinstance(value, str))
# Shape of the formula text each cell is expected to contain:
sample = '=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=382A6978A76C&id=123424374&type=1","炼石航空:2020年年度报告摘要")'
# Two quoted arguments separated by a comma: group 1 is the URL, group 2 the
# announcement title.
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({
    'link': [url for url, _ in list_of_tuple],
    'f_name': [title for _, title in list_of_tuple],
})
# NOTE(review): this path is relative to the current working directory, while
# the CSV is later read back from 'D:/anaconda/10companies/' -- confirm both
# refer to the same file.
df2.to_csv('航空航天行业.csv')
import requests
import re
import pandas as pd
import os
import fitz
import time
# Load the (link, f_name) table and normalise report titles so that
# "<4-digit year>年度报告" is spelled "<4-digit year>年年度报告".
df = pd.read_csv('D:/anaconda/10companies/航空航天行业.csv', encoding='utf-8')
# The previous pattern was a bare look-behind, i.e. a zero-width match located
# after "年度"; substituting there *inserted* the replacement and produced
# "2020年度年年度报告报告". Matching the literal "年度报告" that directly follows
# a 4-digit year replaces it instead, and leaves titles that already read
# "2020年年度报告" untouched.
p = re.compile(r'(?<=\d{4})年度报告')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names
del p, f_names
def filter_links(words, df, include=True):
    """Select the rows of *df* by keywords found in its ``f_name`` column.

    Args:
        words: Iterable of substrings to look for in each ``f_name`` value.
        df: DataFrame with an ``f_name`` column of strings.
        include: Evaluated by truthiness. When truthy, keep rows whose
            ``f_name`` contains at least one of *words*; when falsy, keep rows
            whose ``f_name`` contains none of them. Pass a plain ``bool``:
            a non-empty list such as ``[False]`` is truthy.

    Returns:
        The matching rows of *df*. With an empty *words*, include mode keeps
        no rows and exclude mode keeps every row.
    """
    names = list(df.f_name)
    if include:
        flags = [any(word in name for word in words) for name in names]
    else:
        flags = [all(word not in name for word in words) for name in names]
    # Index with a boolean array rather than a plain list: for an empty frame
    # a plain empty list would be read as "select no columns" and drop f_name.
    mask = pd.Series(flags, dtype=bool).to_numpy()
    return df[mask]
# `include` is tested by truthiness inside filter_links, so it has to be a
# plain bool: the former one-element lists ([False] / [True]) are both truthy
# and made every call below run in "include" mode.

# Drop announcements that are not the annual report itself.
df_all = filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df,include=False)
# Titles without a parenthesised suffix are treated as the original reports,
# titles with one as updated/revised versions.
# NOTE(review): both list entries are the same character; presumably one was
# meant to be the full-width parenthesis -- confirm against the titles.
df_orig = filter_links(['(','('],df_all,include=False)
df_updt = filter_links(['(','('],df_all,include=True)
# Ignore parenthesised titles that announce a cancellation.
df_updt = filter_links(['取消'],df_updt,include=False)
def sub_with_update(df_updt, df_orig):
    """Return a copy of *df_orig* with entries replaced by their updates.

    An original row is considered updated when its ``f_name`` is a substring
    of an ``f_name`` in *df_updt*. For every such pair the value in the
    second-to-last column of the original row is replaced by the value in the
    second-to-last column of the update row. If several update rows match the
    same original row, the last one in *df_updt* wins.

    Args:
        df_updt: DataFrame of updated announcements with an ``f_name`` column.
        df_orig: DataFrame of original announcements with an ``f_name``
            column. It is not modified.

    Returns:
        A new DataFrame shaped like *df_orig* carrying the substituted values.
    """
    df_newest = df_orig.copy()
    updt_names = list(df_updt.f_name)
    for i, name in enumerate(df_orig.f_name):
        for j, updt_name in enumerate(updt_names):
            if name in updt_name:
                # Write into the copy that is returned; the previous code
                # wrote into df_orig and returned the untouched copy.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest
# Merge updated links into the originals, then split out the ten companies
# with the most announcements into one CSV file each.
df_newest = sub_with_update(df_updt,df_orig)
# Re-bind instead of sorting in place: df_all is a filtered slice of another
# frame, and the in-place sort triggered pandas' SettingWithCopyWarning.
df_all = df_all.sort_values(by=['f_name'], ignore_index=True)

# Titles look like "<company>:<announcement>". Use the part before the colon
# as the short name; the fixed 4-character slice kept the colon for
# 3-character company names, which then ended up in the CSV file name.
# Titles without a colon still fall back to the first four characters.
short_names = []
for f in df_newest.f_name:
    head, sep, _ = f.replace(':', ':').partition(':')
    short_names.append(head if sep else f[:4])
df_newest['公司简称'] = short_names

counts = df_newest['公司简称'].value_counts()
# value_counts() sorts by frequency, so the first ten index entries are the
# ten most frequent short names.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

os.makedirs('10companies', exist_ok=True)
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
ten_csv = os.listdir('10companies')
os.chdir('D:/anaconda/10companies')
f_1 = os.listdir()

# NOTE(review): these lists used to be left empty, so the loop that resolves
# the PDF links had nothing to iterate over. They are presumably meant to hold
# the announcements of the ten selected companies -- confirm.
links = [link for df_com in ten_company for link in df_com.link]
f_names = [name for df_com in ten_company for name in df_com.f_name]
def get_PDF_url(url):
    """Fetch an announcement page and extract its first anchor as the PDF link.

    Args:
        url: Address of the announcement page.

    Returns:
        A ``(href, fname)`` tuple, where ``href`` is the first 26 characters
        of the final response URL followed by the anchor's ``href`` value and
        ``fname`` is the stripped anchor text. An empty tuple is returned
        (after emitting a warning) when the page contains no matching anchor.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import warnings

    r = requests.get(url)
    try:
        r.encoding = 'utf-8'
        html = r.text
        # For URLs shaped like 'http://news.windin.com/ns/bulletin.php?...'
        # the first 26 characters are the 'http://news.windin.com/ns/' prefix
        # that the page's href is appended to.
        base = r.url[:26]
    finally:
        r.close()

    a = re.search(r'<a href=(.*?)\s.*?>(.*?)</a>', html, re.DOTALL)
    if a is None:
        # Constructing Warning(...) without raising or warning did nothing;
        # warnings.warn actually reports the problem.
        warnings.warn('没有找到下载链接。请手动检查链接:%s' % url)
        return ()
    href = base + a.group(1)
    fname = a.group(2).strip()
    return (href, fname)
# Resolve every announcement page to its PDF download link and file name, and
# persist the result.
hrefs = []
fnames = []
for link in links:
    result = get_PDF_url(link)
    if not result:
        # get_PDF_url returns an empty tuple when no download link was found;
        # unpacking it into two names raised ValueError, so skip such pages.
        continue
    href, fname = result
    hrefs.append(href)
    fnames.append(fname)
df_final_links = pd.DataFrame({'href': hrefs, 'fname': fnames})
df_final_links.to_csv('D:/anaconda/10companies/航空航天links.csv')
# Download every PDF listed in the links table into the current working
# directory, pausing between requests.
df_final_links = pd.read_csv('D:/anaconda/10companies/航空航天links.csv')
f_names = df_final_links['fname']
hrefs = df_final_links['href']
for href, f_name in zip(hrefs, f_names):
    r = requests.get(href, allow_redirects=True)
    try:
        # The context manager closes the file handle even if the write fails;
        # the bare open(...).write(...) left that to the garbage collector.
        with open('%s' % f_name, 'wb') as pdf_file:
            pdf_file.write(r.content)
    finally:
        r.close()
    # Wait 10 seconds before the next request.
    time.sleep(10)
<ipython-input-5-1e419d1a2319>:59: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-5-1e419d1a2319> in <module> 104 df_final_links.to_csv('D:/anaconda/10companies/航空航天links.csv') 105 --> 106 df_final_links=pd.read_csv('D:/anaconda/10companies/航空航天links.csv') 107 f_names=df_final_links['fname'] 108 hrefs=df_final_links['href'] ~\anaconda3\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision) 684 ) 685 --> 686 return _read(filepath_or_buffer, kwds) 687 688 ~\anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds) 450 451 # Create the parser. 
--> 452 parser = TextFileReader(fp_or_buf, **kwds) 453 454 if chunksize or iterator: ~\anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds) 944 self.options["has_index_names"] = kwds["has_index_names"] 945 --> 946 self._make_engine(self.engine) 947 948 def close(self): ~\anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine) 1176 def _make_engine(self, engine="c"): 1177 if engine == "c": -> 1178 self._engine = CParserWrapper(self.f, **self.options) 1179 else: 1180 if engine == "python": ~\anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds) 2006 kwds["usecols"] = self.usecols 2007 -> 2008 self._reader = parsers.TextReader(src, **kwds) 2009 self.unnamed_cols = self._reader.unnamed_cols 2010 pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__() pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source() FileNotFoundError: [Errno 2] No such file or directory: 'D:/anaconda/10companies/航空航天links.csv'
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
# Plot the operating-revenue series of the ten companies against the year.
rcParams['font.family'] = 'simhei'
df = pd.read_excel("D:/anaconda/营业收入数据表.xlsx")
print(df)

# (column name, line colour) pairs, in drawing order.
series_colors = [
    ('航天动力', 'blue'),
    ('爱乐达', 'darkgrey'),
    ('中国卫通', 'azure'),
    ('天奥电子', 'beige'),
    ('纵横股份', 'green'),
    ('三角防务', 'coral'),
    ('中航光电', 'cyan'),
    ('中国卫星', 'gold'),
    ('炼石航空', 'grey'),
    ('航天发展', 'violet'),
]
for company, line_color in series_colors:
    plt.plot(df["年份"], df[company], label=company, linewidth=1,
             color=line_color, markersize=12)

plt.xlabel("年份")
plt.ylabel('营业总收入')
plt.title("营业收入走势图")
plt.legend()
plt.grid()
年份 爱乐达 航天动力 航天发展 炼石航空 三角防务 天奥电子 中国卫通 中国卫星 \ 0 2021 380.91 785.81 3206.85 717.22 765.50 548.94 1850.44 4256.26 1 2020 303.79 1249.02 4436.05 1131.48 614.85 916.39 2710.30 7007.40 2 2019 184.24 2651.51 4039.26 1979.86 613.88 866.91 2734.19 6463.26 3 2018 128.15 1887.77 3515.80 1602.52 465.72 864.01 2693.99 7583.02 4 2017 135.98 1840.53 2490.90 752.74 374.76 820.12 2620.83 7385.04 5 2016 118.92 1803.67 2042.10 12.65 298.24 761.31 2475.94 6337.26 6 2015 104.90 1503.36 1347.50 154.96 NaN 703.24 NaN 5448.38 7 2014 64.99 1310.62 364.70 243.74 NaN NaN NaN 4664.10 8 2013 NaN 1386.72 536.15 226.20 NaN NaN NaN 4803.53 9 2012 NaN 1247.42 423.56 212.03 NaN NaN NaN 4260.89 中航光电 纵横股份 0 9876.76 123.76 1 10305.22 271.81 2 9158.83 210.76 3 7816.02 116.57 4 6361.81 103.39 5 5854.80 NaN 6 4725.20 NaN 7 3491.25 NaN 8 2601.76 NaN 9 2203.48 NaN