import re
from itertools import chain
from urllib.request import urlopen
def getPageHtml(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch; passed unchanged to ``urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if read() or decode() raises.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
#print(getPageHtml("http://tieba.baidu.com/p/3600458679"))
'''<li class="l_reply_num" style="margin-left:8px" >1193回复贴,共26页</li>'''
def getPagenum(text):
    """Extract the thread's total page count from its HTML source.

    The count is taken from the reply summary, for example
    ``<li class="l_reply_num" ...>1193回复贴,共26页</li>`` yields ``"26"``.
    HTML tags wrapped around the number are tolerated.

    Args:
        text: HTML source of a thread page.

    Returns:
        The page count as a string of digits; convert with ``int`` to use it
        as a number.

    Raises:
        ValueError: If no "共N页" summary is present in *text*.
    """
    # Anchor on the literal 共...页 so that other numbers on the page (reply
    # count, user ids, ...) can never be mistaken for the page count.
    pattern = r'共\s*(?:<[^>]*>\s*)*(\d+)\s*(?:<[^>]*>\s*)*页'
    match = re.search(pattern, text)
    if match is None:
        raise ValueError("page count not found in page source")
    return match.group(1)
#text = getPageHtml("http://tieba.baidu.com/p/3600458679")
#print(getPagenum(text))
'''http://tieba.baidu.com/p/3600458679?pn=2'''
def getPageEMail(count, base_url="http://tieba.baidu.com/p/2314539885"):
    """Crawl every page of a thread and collect the QQ e-mail addresses found.

    Pages are requested as ``<base_url>?pn=1`` through ``<base_url>?pn=<count>``.

    Args:
        count: Number of pages to crawl; any value accepted by ``int()``.
        base_url: Thread URL without the ``pn`` query parameter. Defaults to
            the thread this script was written for.

    Returns:
        A list with one entry per crawled page, each entry being the list of
        addresses matched on that page.
    """
    # Matches addresses such as 811393332@qq.com: 5-12 digits, then @qq.com.
    pattern1 = re.compile(r'\d{5,12}@qq\.com')
    mails = []
    for page in range(1, int(count) + 1):
        url = "%s?pn=%d" % (base_url, page)
        text = getPageHtml(url)
        # Search once and reuse the result for both the log and the return value.
        found = pattern1.findall(text)
        # Report the URL that was actually fetched.
        print("正在爬取%s的内容" % url)
        print(found)
        mails.append(found)
    return mails
def main():
    """Crawl the thread and write every collected address to ``mails.txt``.

    Fetches the first page to learn the page count, crawls each page for
    e-mail addresses, then writes the addresses one per line. Any existing
    ``mails.txt`` in the working directory is overwritten.
    """
    text = getPageHtml("http://tieba.baidu.com/p/2314539885")
    count = getPagenum(text)
    email = getPageEMail(count)
    # getPageEMail returns one list per page; chain flattens them into a
    # single stream of addresses.
    with open("mails.txt", 'w', encoding="utf-8") as f:
        f.writelines(address + "\n" for address in chain.from_iterable(email))
# Start the crawl. This call is unguarded, so it also runs on import.
# NOTE(review): consider an `if __name__ == "__main__":` guard if this file
# is ever imported by other code.
main()