python实现登录人人网并抓取新鲜事的方法

本文实例讲述了python实现登录人人网并抓取新鲜事的方法。分享给大家供大家参考。具体如下:

这里演示了python登录人人网并抓取新鲜事的方法(抓取后的排版不太美观~~)。注意:以下代码基于Python 2,依赖sgmllib、urllib2和cookielib模块(这些模块在Python 3中已被移除或改名),无法直接在Python 3下运行。

import cookielib
import sys
import urllib
import urllib2
from sgmllib import SGMLParser
class spider(SGMLParser):
    """Log in to renren.com and collect news-feed text from the returned page.

    The parser looks at ``<h3>`` elements and at the ``<p class="content">``
    elements that follow them.  Text inside an ``<a>`` within an ``<h3>`` is
    accumulated into a name; the remaining ``<h3>`` text and the text of the
    following content paragraph are stored in ``self.dic`` under that name.

    Typical use: ``login()``, then ``feed(self.file)``, then ``show()``.
    This class targets Python 2 (``sgmllib``, ``urllib2``, ``cookielib``).
    """

    def __init__(self, email, password):
        """Store the credentials and install a cookie-aware URL opener.

        Args:
            email: account e-mail sent as the ``email`` form field.
            password: account password sent as the ``password`` form field.

        Note that ``urllib2.install_opener`` replaces the process-wide
        default opener, so every later ``urllib2.urlopen`` call shares the
        cookie jar created here.
        """
        SGMLParser.__init__(self)
        # Parser state, driven by the start_*/end_* tag handlers below.
        self.h3 = False           # currently inside an <h3>
        self.h3_is_ready = False  # an </h3> was seen; a content <p> may follow
        self.p = False            # currently inside a <p class="content">
        self.h3_and_p = False     # text of the current <p> belongs to the last <h3>
        self.a = False            # inside an <a> opened within an <h3> or content <p>
        self.depth = 0            # count of <p> tags opened inside a content <p>
        self.names = ""           # name text gathered from the current <h3>
        self.dic = {}             # name -> list of collected text fragments
        self.email = email
        self.password = password
        self.domain = 'renren.com'
        cookie = cookielib.CookieJar()
        cookieproc = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(cookieproc)
        urllib2.install_opener(opener)

    def login(self):
        """POST the credentials to the login URL and keep the response body.

        The raw response body is stored in ``self.file`` so that it can be
        passed to ``feed()`` afterwards.  Errors raised by ``urllib2`` are
        not caught here.
        """
        url = 'http://www.renren.com/plogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        req = urllib2.Request(url, urllib.urlencode(postdata))
        response = urllib2.urlopen(req)
        try:
            self.file = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

    def start_h3(self, attrs):
        """Mark that the parser has entered an ``<h3>`` element."""
        self.h3 = True

    def end_h3(self):
        """Leave the ``<h3>`` and allow a following content paragraph."""
        self.h3 = False
        self.h3_is_ready = True

    def start_a(self, attrs):
        """Track ``<a>`` elements, but only inside an ``<h3>`` or content ``<p>``."""
        if self.h3 or self.p:
            self.a = True

    def end_a(self):
        """Mark that the current ``<a>`` element has ended."""
        self.a = False

    def start_p(self, attrs):
        """Handle ``<p>``: start collecting when it has ``class="content"``.

        Paragraphs are ignored until an ``</h3>`` has been seen.  A ``<p>``
        opened while already inside a content paragraph only increases the
        nesting counter.
        """
        if not self.h3_is_ready:
            return
        if self.p:
            self.depth += 1
        for k, v in attrs:
            if k == 'class' and v == 'content':
                self.p = True
                # The paragraph text is attached to the preceding <h3>.
                self.h3_and_p = True

    def end_p(self):
        """Handle ``</p>``: reset the state when the outermost one closes."""
        # NOTE(review): the published listing lost its indentation; the
        # grouping of these statements under ``depth == 0`` is reconstructed.
        if self.depth == 0:
            self.p = False
            self.h3_and_p = False
            self.h3_is_ready = False
            self.names = ""
        if self.p:
            self.depth -= 1

    def handle_data(self, text):
        """Route a chunk of character data according to the parser state."""
        # Text inside <h3><a>...</a></h3> builds up the name.
        if self.h3 and self.a:
            self.names += text
        # Other text directly inside the <h3> is stored under that name.
        if self.h3 and not self.a:
            if text:
                self.dic.setdefault(self.names, []).append(text)
            return
        # Text inside the content paragraph that follows the <h3>.
        if self.h3_and_p:
            self.dic.setdefault(self.names, []).append(text)

    def show(self):
        """Print every collected name followed by its joined text.

        Spaces are stripped from both parts, and the UTF-8 bytes are
        re-encoded to the file-system encoding before printing.
        """
        # getfilesystemencoding() may return None; fall back to UTF-8 then.
        encoding = sys.getfilesystemencoding() or 'utf-8'
        for name in self.dic:
            who = name.replace(' ', '').decode('utf-8').encode(encoding)
            said = ''.join(self.dic[name]).replace(' ', '')
            said = said.decode('utf-8').encode(encoding)
            # Single parenthesised argument: same output as ``print who, said``.
            print('%s %s' % (who, said))
# Replace the two placeholders with real renren.com credentials before running.
renrenspider = spider('your email', 'your password')
renrenspider.login()
renrenspider.feed(renrenspider.file)
# Force the parser to process any data it is still buffering.
renrenspider.close()
renrenspider.show()

希望本文所述对大家的python程序设计有所帮助。

Posted in 未分类