用 Python 抓取电影天堂电影信息的代码

运行环境:Python 2.7,macOS

抓取的是电影天堂里面最新电影的页面。链接地址: http://www.dytt8.net/html/gndy/dyzz/index.html

获取页面中的电影详情页链接

import urllib2
import os
import re
import string
# Collection of movie detail-page URLs.
# querymovielist() appends one absolute URL per movie link it finds.
movieurls = []
# 获取电影列表
def querymovielist():
    """Collect movie detail-page URLs from the "latest movies" index page.

    Downloads the index page, re-encodes it from GB2312 to UTF-8, narrows
    the HTML down to the listing area and appends one absolute URL per
    movie link to the module-level ``movieurls`` list.

    Returns nothing; the result is the side effect on ``movieurls``.

    NOTE(review): the published listing lost the HTML fragments inside both
    regular expressions (and turned ASCII quotes into typographic ones).
    The patterns below are a reconstruction and must be confirmed against
    the current markup of the index page before relying on them.
    """
    url = 'http://www.dytt8.net/html/gndy/dyzz/index.html'

    # Close the HTTP response even if read() fails.
    response = urllib2.urlopen(url)
    try:
        conent = response.read()
    finally:
        response.close()

    # The site serves GB2312; 'ignore' drops bytes that cannot be converted
    # instead of aborting the whole scrape.
    conent = conent.decode('gb2312', 'ignore').encode('utf-8', 'ignore')

    # Step 1: cut out the listing area between the section title and the
    # pager row.  re.S lets '.' span line breaks.
    # NOTE(review): reconstructed anchors - verify against the live page.
    area_pattern = re.compile(
        r'<div class="title_all"><h1><font color=#008800>.*?</a>></font></h1></div>'
        r'(.*?)<td height="25" align="center" bgcolor="#F4FAE2"> ',
        re.S)
    listing = ''.join(re.findall(area_pattern, conent))

    # Step 2: one match per movie entry; group 0 is the relative href.
    # NOTE(review): reconstructed anchors - verify against the live page.
    link_pattern = re.compile(
        r'<a href="(.*?)" class="ulink">(.*?)</a>.*?<td colspan.*?>(.*?)</td>',
        re.S)
    for j in re.findall(link_pattern, listing):
        movieurls.append('http://www.dytt8.net' + j[0])

抓取详情页中的电影数据

def querymovieinfo(movieurls):
for index, item in enumerate(movieurls):
print(‘电影url: ‘ + item)
conent = urllib2.urlopen(item)
conent = conent.read()
conent = conent.decode(‘gb2312′,’ignore’).encode(‘utf-8′,’ignore’)
moviename = re.findall(r’

(.*?)

‘, conent, re.s)
if (len(moviename) > 0):
moviename = moviename[0] + “”
# 截取名称
moviename = moviename[moviename.find(“《”) + 3:moviename.find(“》”)]
else:
moviename = “”
print(“电影名称: ” + moviename.strip())
moviecontent = re.findall(r’

(.*?)’,conent , re.s)
pattern = re.compile(‘(.*?)’, re.s)
moviedate = re.findall(pattern,moviecontent[0])
if (len(moviedate) > 0):
moviedate = moviedate[0].strip() + ”
else:
moviedate = “”
print(“电影发布时间: ” + moviedate[-10:])
pattern = re.compile(‘(.*?)

Posted in 未分类