import html2text import re import pandas as pd with open('../jxufetext.html', 'r', encoding='utf-8') as f: text = f.read() text2 = re.compile('<.*?>(.*)<.*?>').findall(text) #注意到有很多空格,删去并获得列表 #text2 = [i for i in text2 if i!=''] #提取标签内的文字 #word2 = re.compile('(.*?)').findall(text2) word = [] for i in text2: if "(.*)<.*?>').findall(i) for i in wordlist1: word.append(i) elif "

" in i: wordlist2 = re.compile('(.*)

(.*)').findall(i) for i in wordlist2: word.append(i) elif "" in i: wordlist3 = re.compile('(.*)').findall(i) for i in wordlist3: word.append(i) elif "