import re
import pandas as pd
import openpyxl
import os
# Work inside the data directory so the relative file names below resolve.
os.chdir(r"/Users/caizuguang/Desktop/final_data")
xlsx = '新能源行业.xlsx'
df = pd.read_excel(xlsx)

# Open the same workbook with openpyxl and read the raw cell values of
# column C on the active worksheet.
exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C = sheet['C']
links = [c.value for c in C]

# Drop the first and the last cell of the column, then glue the remaining
# cells into one string. Blank cells come back as None and would make
# str.join raise TypeError, so they are skipped and the rest coerced to text.
links_1 = links[1:-1]
links_2 = ''.join(str(v) for v in links_1 if v is not None)

# The pattern captures two consecutive double-quoted fields separated by a
# comma: the first becomes the link, the second the file name.
p = re.compile(r'"(.*?)","(.*?)"')
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({
    'link': [t[0] for t in list_of_tuple],
    'f_name': [t[1] for t in list_of_tuple],
})
df2.to_csv('新能源行业.csv')
import pandas as pd
import re
import requests
import os
import time
# Switch to the data directory and load the link table from its CSV file.
_data_dir = r"/Users/caizuguang/Desktop/final_data"
os.chdir(_data_dir)
df = pd.read_csv('新能源行业.csv')
#p = re.compile('(?<=\d{4}(年报)|(年年报)')
def filter_links(words, df, include=True):
    """Select the rows of *df* whose ``f_name`` matches a list of keywords.

    Args:
        words: Substrings to look for in each ``f_name`` value.
        df: DataFrame with an ``f_name`` column.
        include: If True, keep rows whose name contains at least one of
            *words*; if False, keep rows whose name contains none of them.

    Returns:
        The filtered rows of *df*, with all columns preserved.
    """
    def contains(name, word):
        # Non-text names (e.g. NaN read back from an empty CSV field) are
        # treated as containing nothing instead of raising TypeError.
        return isinstance(name, str) and word in name

    if include:
        keep = [any(contains(name, w) for w in words) for name in df.f_name]
    else:
        keep = [not any(contains(name, w) for w in words) for name in df.f_name]

    # Use a boolean Series rather than a plain list: for an empty frame the
    # list would be [], and df[[]] means "select zero columns", not "no rows".
    mask = pd.Series(keep, index=df.index, dtype=bool)
    return df[mask]
# df_all: rows whose file name contains none of the three excluded keywords.
_excluded_words = ['摘要', '审计', '财务']
df_all = filter_links(_excluded_words, df, include=False)

# Split df_all by whether the file name contains a parenthesis:
# df_orig has the names without one, df_updt the names with one.
# NOTE(review): both entries of the list are identical as written; presumably
# one of them was meant to be a different bracket variant - confirm.
_paren_marks = ['(', '(']
df_orig = filter_links(_paren_marks, df_all, include=False)
df_updt = filter_links(_paren_marks, df_all, include=True)
def sub_with_update(df_updt, df_orig):
    """Overwrite rows of *df_orig* with their matching rows from *df_updt*.

    A row of *df_updt* matches a row of *df_orig* when the original
    ``f_name`` is a substring of the updated ``f_name``. For every match the
    last two columns of the original row are replaced by the last two columns
    of the updated row; when several updates match, the last one wins.

    Args:
        df_updt: DataFrame of updated entries, with an ``f_name`` column.
        df_orig: DataFrame of original entries, with an ``f_name`` column.

    Returns:
        *df_orig* itself, modified in place.
    """
    # Snapshot the names first: matching must use the names as they were
    # before any row gets overwritten.
    orig_names = list(df_orig.f_name)
    updt_names = list(df_updt.f_name)

    for i, orig_name in enumerate(orig_names):
        # Non-text names (e.g. NaN) cannot take part in substring matching.
        if not isinstance(orig_name, str):
            continue
        matches = [
            j for j, updt_name in enumerate(updt_names)
            if isinstance(updt_name, str) and orig_name in updt_name
        ]
        if matches:
            j = matches[-1]
            df_orig.iloc[i, -2] = df_updt.iloc[j, -2]
            df_orig.iloc[i, -1] = df_updt.iloc[j, -1]
    return df_orig
# Replace original entries by their updated versions, then order by file name.
df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)

# The first four characters of the file name are used as the company short name.
df_newest['公司简称'] = df_newest['f_name'].str[:4]
counts = df_newest['公司简称'].value_counts()

# value_counts() orders by descending frequency, so the first ten index
# entries are the ten most frequent short names.
ten_company = [filter_links([cn], df_newest) for cn in counts.index[:10]]

# Write one CSV per company INTO the '10companies' directory; previously the
# directory was created but the files were written next to it.
os.makedirs('10companies', exist_ok=True)
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv(os.path.join('10companies', '%s.csv' % cn))
import pandas as pd
import re
import requests
import os
import time
os.chdir(r"/Users/caizuguang/Desktop/final_data/10companies")

# os.listdir() returns entries in arbitrary order and also lists non-CSV
# files, so pick the CSV files by extension instead of dropping one entry by
# its position in the listing.
file_1 = sorted(name for name in os.listdir() if name.endswith('.csv'))

# Gather the 'link' and 'f_name' columns of every CSV into two flat lists.
links = []
f_names = []
for file_2 in file_1:
    file_3 = pd.read_csv(file_2)
    links.extend(file_3['link'])
    f_names.extend(file_3['f_name'])

# Sample bulletin page URL.
url = 'http://news.windin.com/ns/bulletin.php?code=818C2D61C901&id=124505628&type=1'
def get_PDF_url(url, timeout=30):
    """Fetch a bulletin page and return the link of its first ``<a>`` tag.

    Args:
        url: Address of the bulletin page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(href, fname)`` - the absolute URL and the stripped link text of
        the first ``<a>`` tag on the page - or an empty tuple when the page
        has no such tag (a warning is emitted in that case).
    """
    import warnings
    from urllib.parse import urljoin

    # The context manager releases the connection once the HTML is read.
    with requests.get(url, timeout=timeout) as r:
        r.encoding = 'utf-8'
        html = r.text
        page_url = r.url

    # Only the first <a> tag is inspected, hence search() rather than findall().
    p = re.compile(r'<a href=(.*?)\s.*?>(.*?)</a>', re.DOTALL)
    a = p.search(html)
    if a is None:
        warnings.warn('没有找到下载链接,请手动下载:%s' % url)
        return ()

    # The pattern captures any quotes around the href value; strip them.
    href = a.group(1).strip('\'"')
    fname = a.group(2).strip()
    # Resolve the href against the page URL instead of a fixed-length prefix.
    return (urljoin(page_url, href), fname)
def _save_pdf(link):
    """Download the file linked from bulletin page *link* into the cwd.

    Returns True when a file was written, False when get_PDF_url returned
    an empty result for the page.
    """
    result = get_PDF_url(link)
    if not result:
        # Skip the page instead of failing on tuple unpacking.
        return False
    href, fname = result
    with requests.get(href, allow_redirects=True) as r:
        # Binary mode; the with-block closes the file handle.
        with open('%s' % fname, 'wb') as pdf_file:
            pdf_file.write(r.content)
    return True


for link in links:
    _save_pdf(link)
    # Wait 10 seconds before the next request.
    time.sleep(10)

# Fetch the first link once more on its own (mode fixed from 'wh' to 'wb').
if links:
    _save_pdf(links[0])
import pandas as pd
import re
import requests
import os
import time
os.chdir(r"/Users/caizuguang/Desktop/final_data/10companies")

# Only the CSV files hold link tables; any other file in the directory would
# make read_csv fail, so filter by extension.
file_1 = sorted(name for name in os.listdir() if name.endswith('.csv'))

# Gather the 'link' and 'f_name' columns of every CSV into two flat lists.
links = []
f_names = []
for file_2 in file_1:
    file_3 = pd.read_csv(file_2)
    links.extend(file_3['link'])
    f_names.extend(file_3['f_name'])
import pandas as pd
import re
import requests
import os
import time
os.chdir(r"/Users/caizuguang/Desktop/final_data/10companies")

# os.listdir() gives no ordering guarantee, so removing "the third entry"
# is unreliable; select the CSV link tables by extension instead.
file_1 = sorted(name for name in os.listdir() if name.endswith('.csv'))

# Gather the 'link' and 'f_name' columns of every CSV into two flat lists.
links = []
f_names = []
for file_2 in file_1:
    file_3 = pd.read_csv(file_2)
    links.extend(file_3['link'])
    f_names.extend(file_3['f_name'])
f_names
1.成分及权重分析
import tushare as ts
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdate
import os
# Read the constituent table (first column as index) from the analysis
# directory and drop every column that has a missing value.
os.chdir('/Users/caizuguang/Desktop/新能源行业分析')
_constituents_raw = pd.read_excel('新能源行业数据.xlsx', index_col=0)
new_energy = _constituents_raw.dropna(axis=1)
new_energy.head(10)  # first 10 rows
1.对新能源行业成分股的上市板、中信三级行业分别进行统计并可视化,代码及展示如下:
# Count the rows per value of the '中信三级行业' column. The counts are stored
# under an explicit column name because the name value_counts() gives its
# result differs between pandas versions (the source column name before 2.0,
# 'count' from 2.0 on), which made df0['中信三级行业'] fail on newer pandas.
df0 = new_energy['中信三级行业'].value_counts().to_frame(name='中信三级行业')
df0

# Bar chart of the counts, one bar per category.
plt.figure(figsize=(12, 6))
plt.bar(df0.index, df0['中信三级行业'])
plt.title('新能源行业细分行业统计', fontsize=12)
plt.xticks(rotation=30)
for x, y in enumerate(df0['中信三级行业']):
    # Print each value 0.05 above the top of its bar.
    plt.text(x, y + 0.05, y, ha='center', fontsize=10)
plt.xlabel('中信三级行业', fontsize=16, color='r')
plt.ylabel('数量', fontsize=16, color='r')
plt.show()
从以上图中可以看到,新能源下游产业数量较多,太阳能、风能各占9家,其他细分行业也如上图展示。
2、新能源行业上市板统计:
# Count the rows per value of the '上市板' column. The column is named
# explicitly because value_counts() names its result differently across
# pandas versions ('count' from 2.0 on), which made df1['上市板'] fail there.
df1 = new_energy['上市板'].value_counts().to_frame(name='上市板')
df1

# Pie chart of the counts with percentage labels (two decimals).
plt.pie(df1['上市板'], labels=df1.index, autopct="%0.2f%%")
plt.show()
从上图可以看到该指数成分公司在各上市板的占比,符合常识:上证主板上市最多。
2.指数数据分析
import tushare as ts
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdate
import os
# Read the index data (first column as index) from the analysis directory
# and drop every column that has a missing value.
os.chdir('/Users/caizuguang/Desktop/新能源行业分析')
_index_raw = pd.read_excel('指数数据.xlsx', index_col=0)
new_energy_index = _index_raw.dropna(axis=1)
new_energy_index.head(10)  # first 10 rows
以收盘价为准,绘制其历年走势图,用以反映新能源行业发展情况
# Convert the index to datetimes so the frame is a time series.
new_energy_index.index = pd.to_datetime(new_energy_index.index)

# Line chart of the '收盘价' column over time.
_close = new_energy_index['收盘价']
plt.figure(figsize=(10, 5))
plt.plot(_close, lw=2, label='收盘价', color='r')
plt.title('新能源 (000941.CSI)走势图')
plt.xlabel('日期', fontsize=13)
plt.ylabel('收盘价', fontsize=13, rotation=90)
plt.xticks(fontsize=13, rotation=30)
plt.grid()
plt.legend()
plt.show()
都说股市是经济的晴雨表,我们可以看到近来新能源行业发展迅速。
以下是现金分红统计分析
import tushare as ts
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdate
import os
# Read the dividend table (first column as index) from the analysis
# directory and drop every row that has a missing value.
os.chdir('/Users/caizuguang/Desktop/新能源行业分析')
_dividend_raw = pd.read_excel('现金分红统计.xlsx', index_col=0)
profit_share = _dividend_raw.dropna()
profit_share

# Draw the total-dividend column twice on the same figure: first as a
# yellow line, then as bars.
_total_col = '现金分红总额(亿元)\n'  # the column header ends with a newline
_totals = profit_share[_total_col]
plt.figure(figsize=(10, 6))
plt.title('历年现金分红')
_totals.plot(color='y')
plt.bar(profit_share.index, _totals)
plt.ylabel('亿元', fontsize=13)
plt.show()
基本上历年现金分红越来越高