获取网页中所有的文字

# encoding=utf8
 
import sys

# Python 2 only: make UTF-8 the implicit str/unicode conversion codec so the
# str() calls on scraped text further down do not raise UnicodeEncodeError.
# site.py removes sys.setdefaultencoding at interpreter start-up, and
# reload(sys) brings it back. Python 3 has neither a builtin reload() nor
# sys.setdefaultencoding (its default is already UTF-8), so running the hack
# there would fail with NameError -- hence the version guard.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
 
import re
import requests
from bs4 import BeautifulSoup
 
 
# Download the article. The timeout stops the script from blocking forever on
# an unresponsive server, and raise_for_status() aborts on a 4xx/5xx reply
# instead of parsing an HTTP error page as if it were the article.
html = requests.get('https://mp.weixin.qq.com/s?src=11&timestamp=1533887718&ver=1051&signature=Xszdx5nmmHyebcH0MXxyHi7-jDwGoNDUDXCHJzPVic68tXGRSTiM3CStUDfSR*aALaC3nK3Ez4e33uLR5ir1pLgy3vEvWXWOvVXgAbsXMn5fB-HWboOW26GH*KMRVhgX&new=1', timeout=10)
html.raise_for_status()
# Parse the decoded response body with the html5lib tree builder.
soup = BeautifulSoup(html.text, "html5lib")
# Collect every text node in the document; find_all is the bs4 spelling of
# the legacy findAll alias.
data = soup.find_all(text=True)
 
 
def visible(element):
    """Return True when a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of ``style``, ``script``,
    ``head``, ``title`` or the ``[document]`` root, or when the node's own
    text is an HTML comment (``<!-- ... -->``).

    Args:
        element: A text node. It must behave like a text string and expose
            ``parent.name``.

    Returns:
        bool: False for nodes that should be dropped, True otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # Match against the text itself. The earlier str(element.encode('utf-8'))
    # produces the bytes repr "b'...'" on Python 3, so the pattern could never
    # match there. DOTALL lets the pattern span comments that contain newlines.
    if re.match(r'<!--.*-->', element, re.DOTALL):
        return False
    # NOTE(review): BeautifulSoup 4 appears to hand comments back as Comment
    # nodes whose text omits the <!-- --> delimiters; if so, an
    # isinstance(element, bs4.Comment) check is also needed -- confirm.
    return True
 
 
# Keep only the visible text nodes. Build a real list: on Python 3 filter()
# returns a one-shot iterator that the write loop below would exhaust,
# leaving the final print with nothing to show.
result = [node for node in data if visible(node)]

# Echo each visible node to stdout and append it to res.txt (nodes are written
# back to back, with no separator added between them).
with open('res.txt', "w+") as p:
    for node in result:
        text = str(node)
        print(text)
        p.write(text)


# Single-argument print(...) is valid on both Python 2 and Python 3, unlike
# the Python-2-only `print list(result)` statement.
print(result)

You may also like

发表评论

电子邮件地址不会被公开。 必填项已用*标注