A blog article crawler example in Python

The code is as follows:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# jcrawler
# Author: Jam

import time
import urllib2
from bs4 import BeautifulSoup

# Target site
TARGET_HOST = "http://adirectory.blog.com"
# User agent
USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36')

# Link-harvesting rules
# Rules for collecting category links
CATEGORY_FIND = [{'find_mode': 'find',     'find_tag': 'p', 'rule': {'id': 'cat-nav'}},
                 {'find_mode': 'find_all', 'find_tag': 'a', 'rule': {}}]
# Rules for collecting article links
ARTICLE_LIST_FIND = [{'find_mode': 'find',     'find_tag': 'p',  'rule': {'id': 'content'}},
                     {'find_mode': 'find_all', 'find_tag': 'h2', 'rule': {'class': 'title'}},
                     {'find_mode': 'find_all', 'find_tag': 'a',  'rule': {}}]

# Pagination URL pattern ('#page' is replaced with the page number)
PAGE_URL = 'page/#page/'
PAGE_START = 1
PAGE_STEP = 1
PAGE_STOP_HTML = '404: Page Not Found'
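Each rule list is a declarative chain of BeautifulSoup lookups that get_html_find() below applies in order. As a rough sketch, CATEGORY_FIND is equivalent to the hand-written chain here (category_links is only an illustration, assuming the target page really wraps its category menu in a p element with id="cat-nav"):

from bs4 import BeautifulSoup

def category_links(html_text):
    soup = BeautifulSoup(html_text)
    nav = soup.find('p', {'id': 'cat-nav'})  # step 1: the 'find' rule
    return nav.find_all('a', {})             # step 2: the 'find_all' rule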

def get_html_text(url):
    """Fetch a URL with browser-like headers and return the raw HTML."""
    request = urllib2.Request(url)
    request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp")
    request.add_header('Accept-Encoding', "*")
    request.add_header('User-Agent', USER_AGENT)
    return urllib2.urlopen(request).read()
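A quick smoke test of the fetcher might look like this (assuming the placeholder host above were reachable; it is not a real site):

print(len(get_html_text(TARGET_HOST)))  # size in bytes of the fetched front page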

def arr_to_str(var_arr):
    """Concatenate the string form of every element (re-serializes tags to HTML)."""
    return_str = ""
    for s in var_arr:
        return_str += str(s)
    return return_str
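Because iterating a BeautifulSoup tag or result set yields tags, arr_to_str() effectively turns a query result back into markup. A small illustration (the snippet is made up):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p><a href="/a">A</a><a href="/b">B</a></p>')
print(arr_to_str(soup.find_all('a')))  # -> '<a href="/a">A</a><a href="/b">B</a>'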

def get_html_find(html_text, find_rule):
    """Apply a chain of find/find_all rules, re-parsing the result of each step."""
    find_return = BeautifulSoup(html_text)
    return_text = ""
    for f in find_rule:
        if return_text != "":
            find_return = BeautifulSoup(return_text)
        if f['find_mode'] == 'find':
            find_return = find_return.find(f['find_tag'], f['rule'])
        if f['find_mode'] == 'find_all':
            find_return = find_return.find_all(f['find_tag'], f['rule'])
        return_text = arr_to_str(find_return)
    return find_return
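This is how one rule's output feeds the next: each result is flattened with arr_to_str() and re-parsed before the following rule runs. For example, against an inline snippet (a sketch; the HTML here is invented to match CATEGORY_FIND):

sample = '<p id="cat-nav"><a href="/linux/">Linux</a><a href="/python/">Python</a></p>'
for tag in get_html_find(sample, CATEGORY_FIND):
    print(tag.string + '|' + tag['href'])  # Linux|/linux/  then  Python|/python/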

def get_category():
    """Collect the category links from the site's front page."""
    categorys = []
    html_text = get_html_text(TARGET_HOST)
    find_return = get_html_find(html_text, CATEGORY_FIND)
    for tag in find_return:
        print "[G]->Category:" + tag.string + "|URL:" + tag['href']
        categorys.append({'name': tag.string, 'url': tag['href']})
    return categorys

def get_article_list(category_url):
    """Walk a category's pages and collect article links until a 404 page appears."""
    articles = []
    page = PAGE_START
    while True:
        html_text = ""
        page_url = PAGE_URL.replace("#page", str(page))
        print "[G]->pageUrl:" + category_url + page_url
        # Retry loop: stop on 404, wait and retry on 504, give up on anything else.
        while True:
            try:
                html_text = get_html_text(category_url + page_url)
                break
            except urllib2.HTTPError as e:
                print "[E]->HTTP Error:" + str(e.code)
                if e.code == 404:
                    html_text = PAGE_STOP_HTML
                    break
                if e.code == 504:
                    print "[E]->HTTP Error 504: Gateway Time-out, wait"
                    time.sleep(5)
                else:
                    break

        if html_text.find(PAGE_STOP_HTML) >= 0:
            print "End page."
            break
        else:
            find_return = get_html_find(html_text, ARTICLE_LIST_FIND)
            for tag in find_return:
                if tag.string is not None and tag['href'].find(TARGET_HOST) >= 0:
                    print "[G]->Article:" + tag.string + "|URL:" + tag['href']
                    articles.append({'name': tag.string, 'url': tag['href']})

        page += PAGE_STEP
    return articles
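The page URLs are built by substituting the counter into PAGE_URL, so a category crawl walks page/1/, page/2/, ... until the stop marker turns up. The expansion itself is just (where '/some-category/' is a made-up placeholder for a real category URL):

for page in range(PAGE_START, PAGE_START + 3, PAGE_STEP):
    print(TARGET_HOST + '/some-category/' + PAGE_URL.replace('#page', str(page)))
    # http://adirectory.blog.com/some-category/page/1/ ... page/3/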

print "[G]->get_category"
my_categorys = get_category()
print "[G]->get_category->Success."
time.sleep(3)
for category in my_categorys:
    print "[G]->get_article_list:" + category['name']
    get_article_list(category['url'])
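Note that the script targets Python 2 (urllib2, print statements). A minimal sketch of the fetcher under Python 3, where urllib2 became urllib.request, could look like this (get_html_text_py3 is a hypothetical name, same headers as above):

import urllib.request

def get_html_text_py3(url, user_agent):
    request = urllib.request.Request(url, headers={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp',
        'Accept-Encoding': '*',
        'User-Agent': user_agent,
    })
    return urllib.request.urlopen(request).read()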