抓取糗事百科的爬虫程序
先贴上代码,等假期回家了把过程写一写
# -*- coding:utf-8 -*-
import re
import urllib2
# Fetch one page of the "hot" listing and print every post found on it.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)

# Send a browser-like User-Agent header along with the request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Three capture groups per post:
#   1. the text after the <img> inside the author block's link,
#   2. the body of the "content" div up to the HTML comment,
#   3. the text inside that HTML comment (printed below as "time").
# re.S lets '.' span newlines, since one post covers several lines of HTML.
pattern = re.compile(
    '<div class="author".*?>.*?<a.*?>.*?<img.*?/>(.*?)</a>.*?</div>'
    '.*?<div class="content">(.*?)<!--(.*?)-->.*?</div>',
    re.S)

# Stays empty when the download fails, so the parsing below simply finds
# nothing instead of touching an unbound `response`.
content = u''
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
except urllib2.URLError as e:
    # Report whichever details the error object carries.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    try:
        content = response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()

# findall with three groups yields a list of 3-tuples, one per matched post.
items = re.findall(pattern, content)
for author, text, stamp in items:
    print('<<<' + '-' * 60 + '>>>')
    print('author:' + author.strip())
    print('content:' + text.strip())
    print('time:' + stamp.strip())
    print('\n')
版权声明:本文为GYZhe原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。