import pandas as pd
import openpyxl
import re

# Workbook whose column C is read below and turned into a (link, f_name) CSV.
xlsx = 'D:/anaconda/10companies/航空航天行业.xlsx'

# Loaded through pandas as well; not used again in this cell.
df = pd.read_excel(xlsx)

exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [c.value for c in C]

# Drop the first and the last cell of the column (presumably a header and a
# footer row -- TODO confirm), and keep only text cells: an empty cell yields
# None, which would make ''.join() raise TypeError.
links_1 = [v for v in links[1:-1] if isinstance(v, str)]
links_2 = ''.join(links_1)

# Example of the cell content the regex below is written for.
sample  = '=HYPERLINK("http://news.windin.com/ns/bulletin.php?code=382A6978A76C&id=123424374&type=1","炼石航空:2020年年度报告摘要")'

# Each match is a ("url", "title") pair of adjacent double-quoted strings.
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)

df2 = pd.DataFrame({'link':[t[0] for t in list_of_tuple], 'f_name':[t[1] for t in list_of_tuple]})

# Written relative to the current working directory.
df2.to_csv('航空航天行业.csv')


import requests
import re 
import pandas as pd
import os
import fitz
import time

df = pd.read_csv('D:/anaconda/10companies/航空航天行业.csv', encoding='utf-8')
# Rewrite "<4 digits>年度报告" as "<4 digits>年年度报告" so both spellings of the
# title end up identical. The lookbehind only anchors on the four digits; the
# text "年度报告" itself must be consumed by the match, otherwise sub() would
# insert the replacement next to the existing text instead of replacing it.
# Titles already spelled "<4 digits>年年度报告" do not match and stay unchanged.
p = re.compile(r'(?<=\d{4})年度报告')
f_names = [p.sub('年年度报告', f) for f in df.f_name]
df['f_name'] = f_names; del p,f_names

def filter_links(words,df,include=True):
    """Filter the rows of *df* by keywords found in its ``f_name`` column.

    Parameters
    ----------
    words : iterable of str
        Keywords tested with substring matching against each ``f_name``.
    df : pandas.DataFrame
        Frame with an ``f_name`` column of strings.
    include : bool, or list/tuple of bool
        Truthy: keep rows whose name contains at least one keyword.
        Falsy: keep rows whose name contains none of the keywords.
        A list/tuple is unwrapped with ``any()`` so that ``[False]`` behaves
        like ``False`` (a bare ``if [False]:`` would be truthy).

    Returns
    -------
    pandas.DataFrame
        A new frame (a copy) holding the selected rows with their original
        index labels. With no keywords, include-mode selects nothing and
        exclude-mode selects everything.
    """
    if isinstance(include, (list, tuple)):
        include = any(include)

    hits = [any(word in f for word in words) for f in df.f_name]
    keep = hits if include else [not hit for hit in hits]

    # A boolean Series (rather than a plain list) keeps the selection
    # row-wise even when df is empty, where df[[]] would select columns.
    mask = pd.Series(keep, index=df.index, dtype=bool)
    # Return a copy so callers can sort or add columns on the result without
    # pandas raising SettingWithCopyWarning.
    return df[mask].copy()
                
# Four successive filters on the title column; df_updt is rebound by the
# last call, so the order of these statements matters.
# NOTE(review): `include` is passed as a one-element list in every call. A
# non-empty list is truthy, so a plain `if include:` test treats [False]
# exactly like True -- confirm that filter_links unwraps the list, or pass
# bare False/True here.
# NOTE(review): ['(','('] lists the same character twice; presumably one entry
# was meant to be the full-width bracket -- verify against the real titles.
df_all = filter_links(['摘要','问询函','社会责任','审计','财务','风险','债券'],df,include=[False])
df_orig = filter_links(['(','('],df_all,include=[False])
df_updt = filter_links(['(','('],df_all,include=[True])
df_updt = filter_links(['取消'],df_updt,include=[False])

def sub_with_update(df_updt,df_orig):
    """Return a copy of *df_orig* with values replaced from matching updates.

    A row of *df_updt* matches a row of *df_orig* when the original
    ``f_name`` is a substring of the update's ``f_name``. For every match the
    second-to-last column of the original row is overwritten with the
    second-to-last column of the update row (presumably the link column --
    verify against the caller's column layout). When several updates match
    the same original row, the last one in *df_updt* order wins.

    Parameters
    ----------
    df_updt : pandas.DataFrame
        Updated announcements; must have an ``f_name`` column.
    df_orig : pandas.DataFrame
        Original announcements; must have an ``f_name`` column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The patched copy of *df_orig*.
    """
    df_newest = df_orig.copy()
    for i, f in enumerate(df_orig.f_name):
        for j, fn in enumerate(df_updt.f_name):
            if f in fn:
                # Write into the copy that is returned; writing into df_orig
                # would mutate the caller's frame and leave the returned
                # frame without any of the updates.
                df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest

df_newest  = sub_with_update(df_updt,df_orig)
# Rebind instead of sorting in place: df_all comes out of a row filter, and an
# in-place sort on such a frame is what raised SettingWithCopyWarning.
df_all = df_all.sort_values(by=['f_name'],ignore_index=True)
# The first four characters of the title are used as the company key.
# NOTE(review): for names shorter than four characters this would also pick up
# whatever follows the name in the title -- verify against the data.
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]

counts = df_newest['公司简称'].value_counts()

# One frame per company for the ten most frequent keys.
ten_company = []

for cn in counts.index[:10]:
    ten_company.append(filter_links([cn],df_newest))

# exist_ok avoids the separate existence check.
os.makedirs('10companies', exist_ok=True)

for df_com in ten_company:
    cn=df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)

ten_csv=os.listdir('10companies')

os.chdir('D:/anaconda/10companies')
f_1=os.listdir()

# Collect the announcement links (and their titles) of the ten companies so
# the resolving loop further down has something to iterate over; with an
# empty `links` list that loop never runs and never writes its CSV.
# Assumes the frames carry 'link' and 'f_name' columns -- TODO confirm.
links = []
f_names = []
for df_com in ten_company:
    links.extend(df_com['link'])
    f_names.extend(df_com['f_name'])
  
def get_PDF_url(url):
    """Fetch an announcement page and extract its first ``<a href=...>`` link.

    Parameters
    ----------
    url : str
        Address of the announcement page.

    Returns
    -------
    tuple
        ``(href, fname)`` where ``href`` is the first 26 characters of the
        final response URL followed by the captured href text, and ``fname``
        is the stripped anchor text. An empty tuple ``()`` is returned, after
        emitting a warning, when the page contains no matching anchor, so
        callers must check the result before unpacking it.
    """
    import warnings

    r = requests.get(url)
    r.encoding = 'utf-8'
    html = r.text
    r.close()

    # Group 1: href text up to the first whitespace; group 2: anchor text.
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)
    if a is None:
        # warnings.warn actually reports the problem; merely constructing a
        # Warning object has no visible effect.
        warnings.warn('没有找到下载链接。请手动检查链接：%s' % url)
        return ()

    # The captured href is appended to a fixed-length prefix of the response
    # URL (presumably scheme + host + base path -- verify for other hosts).
    href = r.url[:26] + a.group(1)
    fname = a.group(2).strip()
    return (href, fname)

hrefs=[];fnames=[]

# Resolve every announcement page to its download link and file name.
for link in links:
    found = get_PDF_url(link)
    if not found:
        # get_PDF_url returns an empty tuple when no link was found;
        # unpacking it would raise ValueError, so skip that page.
        continue
    href,fname = found
    hrefs.append(href)
    fnames.append(fname)

# Write the table once, after the loop, so the file exists even when `links`
# is empty; otherwise the read_csv below fails with FileNotFoundError.
df_final_links=pd.DataFrame({'href':hrefs,'fname':fnames})
df_final_links.to_csv('D:/anaconda/10companies/航空航天links.csv')

df_final_links=pd.read_csv('D:/anaconda/10companies/航空航天links.csv')
f_names=df_final_links['fname']
hrefs=df_final_links['href']
for href,f_name in zip(hrefs,f_names):
    r = requests.get(href, allow_redirects=True)
    # Files are written relative to the current working directory.
    with open('%s' %f_name, 'wb') as pdf:
        pdf.write(r.content)
    # Close each response inside the loop; a single close after the loop
    # would raise NameError when nothing was downloaded.
    r.close()
    # Pause between downloads.
    time.sleep(10)

<ipython-input-5-1e419d1a2319>:59: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all.sort_values(by=['f_name'],inplace=True,ignore_index=True)

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-5-1e419d1a2319> in <module>
    104     df_final_links.to_csv('D:/anaconda/10companies/航空航天links.csv')
    105 
--> 106 df_final_links=pd.read_csv('D:/anaconda/10companies/航空航天links.csv')
    107 f_names=df_final_links['fname']
    108 hrefs=df_final_links['href']

~\anaconda3\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    684     )
    685 
--> 686     return _read(filepath_or_buffer, kwds)
    687 
    688 

~\anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    450 
    451     # Create the parser.
--> 452     parser = TextFileReader(fp_or_buf, **kwds)
    453 
    454     if chunksize or iterator:

~\anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    944             self.options["has_index_names"] = kwds["has_index_names"]
    945 
--> 946         self._make_engine(self.engine)
    947 
    948     def close(self):

~\anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
   1176     def _make_engine(self, engine="c"):
   1177         if engine == "c":
-> 1178             self._engine = CParserWrapper(self.f, **self.options)
   1179         else:
   1180             if engine == "python":

~\anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
   2006         kwds["usecols"] = self.usecols
   2007 
-> 2008         self._reader = parsers.TextReader(src, **kwds)
   2009         self.unnamed_cols = self._reader.unnamed_cols
   2010 

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] No such file or directory: 'D:/anaconda/10companies/航空航天links.csv'


import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
# Font family used for all text in the figure.
rcParams['font.family'] = 'simhei'

df = pd.read_excel("D:/anaconda/营业收入数据表.xlsx")
print(df)

# (column name, line colour) pairs, drawn in this order.
company_colors = [
    ('航天动力', 'blue'),
    ('爱乐达', 'darkgrey'),
    ('中国卫通', 'azure'),
    ('天奥电子', 'beige'),
    ('纵横股份', 'green'),
    ('三角防务', 'coral'),
    ('中航光电', 'cyan'),
    ('中国卫星', 'gold'),
    ('炼石航空', 'grey'),
    ('航天发展', 'violet'),
]

# One line per company, plotted against the year column.
for company, colour in company_colors:
    plt.plot(
        df["年份"],
        df[company],
        label=company,
        linewidth=1,
        color=colour,
        markersize=12,
    )

plt.xlabel("年份")
plt.ylabel('营业总收入')
plt.title("营业收入走势图")
plt.legend()
plt.grid()

     年份     爱乐达     航天动力     航天发展     炼石航空    三角防务    天奥电子     中国卫通     中国卫星  \
0  2021  380.91   785.81  3206.85   717.22  765.50  548.94  1850.44  4256.26   
1  2020  303.79  1249.02  4436.05  1131.48  614.85  916.39  2710.30  7007.40   
2  2019  184.24  2651.51  4039.26  1979.86  613.88  866.91  2734.19  6463.26   
3  2018  128.15  1887.77  3515.80  1602.52  465.72  864.01  2693.99  7583.02   
4  2017  135.98  1840.53  2490.90   752.74  374.76  820.12  2620.83  7385.04   
5  2016  118.92  1803.67  2042.10    12.65  298.24  761.31  2475.94  6337.26   
6  2015  104.90  1503.36  1347.50   154.96     NaN  703.24      NaN  5448.38   
7  2014   64.99  1310.62   364.70   243.74     NaN     NaN      NaN  4664.10   
8  2013     NaN  1386.72   536.15   226.20     NaN     NaN      NaN  4803.53   
9  2012     NaN  1247.42   423.56   212.03     NaN     NaN      NaN  4260.89   

       中航光电    纵横股份  
0   9876.76  123.76  
1  10305.22  271.81  
2   9158.83  210.76  
3   7816.02  116.57  
4   6361.81  103.39  
5   5854.80     NaN  
6   4725.20     NaN  
7   3491.25     NaN  
8   2601.76     NaN  
9   2203.48     NaN

实验报告¶

第一步 转换为csv格式文件¶

第二步 读取csv文件，并筛选¶

¶

第三步 绘制图表¶

实验报告¶

第一步 转换为csv格式文件¶

第二步 读取csv文件，并筛选¶

¶

第三步 绘制图表¶

第一步 转换为csv格式文件¶

第二步 读取csv文件，并筛选¶

第三步 绘制图表¶