This version is an evolution of my previous spider: it can crawl the first 10 pages of hot posts and download their popular images. Some parts are still rough around the edges; if you hit a problem, please leave a comment.
Please credit this post when reposting, thanks.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib
import urllib2
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class spider:
    def getPage(self, page):
        url = "http://m.qiushibaike.com/hot/page/" + page + '/'
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        myResponse = urllib2.urlopen(req)
        myPage = myResponse.read()
        unicodePage = myPage.decode('utf-8')
        soup = BeautifulSoup(unicodePage, 'html.parser')
        # The output file is cleared once in loadPage() before the loop,
        # so each page is simply appended here.
        f = open('QB.txt', 'a')
        # Each hot-list entry lives in a <div class="content"> element.
        for content in soup.find_all("div", "content"):
            f.write('***********' + content.text)
        f.close()
    def getPic(self, page):
        url = "http://m.qiushibaike.com/imgrank/page/" + page + '/'
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(url, headers=headers)
        myResponse = urllib2.urlopen(req)
        myPage = myResponse.read()
        unicodePage = myPage.decode('utf-8')
        soup = BeautifulSoup(unicodePage, 'html.parser')
        names = []
        urls = []
        for tag in soup.find_all('img'):
            urls.append(tag.get('src'))
            names.append(tag.get('alt'))
            try:
                # Skip relative URLs (icons, placeholders) and keep the
                # two lists in step with each other.
                if not urls[-1] or urls[-1][:4] != 'http':
                    del urls[-1]
                    del names[-1]
                    continue
                # Save as "<alt text><original extension>", e.g. "xxx.jpg".
                urllib.urlretrieve(urls[-1], (names[-1] or 'unnamed') + urls[-1][-4:])
            except:
                if len(urls) != len(names):
                    print u"Image count and name count do not match, please debug..."
                print u"An error occurred, the image could not be downloaded, please debug..."
    def loadPage(self):
        self.loop = True
        self.page = 1
        # Clear output from any previous run; getPage() appends to it.
        open('QB.txt', 'w').close()
        try:
            while self.loop:
                page = self.page
                print u"Downloading page %d, please wait..." % page
                self.getPage(str(page))
                self.page += 1
                if page == 10:
                    self.loop = False
                    print u"Download complete"
                    break
        except:
            print u'Unable to load the page'
    def picPage(self):
        self.loop = True
        self.page = 1
        try:
            while self.loop:
                page = self.page
                print u"Downloading images from page %d, please wait..." % page
                self.getPic(str(page))
                self.page += 1
                if page == 10:
                    self.loop = False
                    print u"Download complete"
                    break
        except:
            print u'Unable to load the page'
    def start(self):
        print u"Press Enter to save the first 10 pages of today's Qiushibaike hot list to a text file and download today's hot images"
        raw_input('>')
        self.loadPage()
        self.picPage()


if __name__ == '__main__':
    QB = spider()
    QB.start()
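The code above targets Python 2 (urllib2, reload(sys)). For readers on Python 3, here is a minimal sketch of the same image-download step using urllib.request; the URL and the page structure are assumptions carried over from the code above and may no longer match the live site:

# Hypothetical Python 3 port of getPic(); assumes the same
# m.qiushibaike.com layout as the Python 2 code above.
import os
import urllib.request

from bs4 import BeautifulSoup

def get_pics(page):
    url = "http://m.qiushibaike.com/imgrank/page/%d/" % page
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all('img'):
        src = tag.get('src')
        name = tag.get('alt') or 'unnamed'
        if not src or not src.startswith('http'):
            continue  # skip relative URLs (icons, placeholders)
        # Keep the original extension, defaulting to ".jpg".
        ext = os.path.splitext(src)[1] or '.jpg'
        try:
            urllib.request.urlretrieve(src, name + ext)
        except OSError as e:
            print("Could not download %s: %s" % (src, e))

if __name__ == '__main__':
    for page in range(1, 11):
        print("Downloading images from page %d..." % page)
        get_pics(page)

Using os.path.splitext instead of slicing the last four characters handles extensions of any length (".jpeg", ".gif"), which the slice-based version above does not.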
Copyright notice: this is an original article by chinwuforwork, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.