使用python解析xml成对应的html示例分享

使用 SAX 将 ddt.xml 解析成 HTML(输出文件为 ddt_sax.html)。当然啦,如果得到了 XML 对应的 XSL 文件,可以直接用 libxml2(配合 libxslt)将其转换成 HTML。

代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#—————————————
# 程序:xml解析器
# 版本:01.0
# 作者:mupeng
# 日期:2013-12-18
# 语言:python 2.7
# 功能:将xml解析成对应的html
# 注解:该程序用xml.sax模块的parse函数解析xml,并生成事件
#       继承ContentHandler并重写其事件处理函数
#       dispatcher主要用于相应标签的起始、结束事件的派发
#—————————————
from xml.sax.handler import ContentHandler
from xml.sax import parse

class dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element called ``name`` the start event looks up a method named
    ``'start' + name.capitalize()`` and the end event looks up
    ``'end' + name.capitalize()``.  When no such callable exists, the
    fallback ``defaultStart`` / ``defaultEnd`` is used if it is defined.
    Handlers are invoked without arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler registered for ``prefix`` + element ``name``.

        prefix -- ``'start'`` or ``'end'``.
        name   -- element name as reported by the SAX parser.
        attrs  -- element attributes; accepted for interface compatibility
                  but not forwarded, because handlers take no arguments.

        Does nothing when neither the specific nor the default handler is
        a callable attribute of ``self``.
        """
        handler = getattr(self, prefix + name.capitalize(), None)
        if not callable(handler):
            # No tag-specific handler: fall back to defaultStart/defaultEnd.
            handler = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(handler):
            handler()

    def startElement(self, name, attrs):
        """SAX callback for an opening tag; dispatches a 'start' event."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback for a closing tag; dispatches an 'end' event."""
        self.dispatch('end', name)

    # xml.sax invokes the camel-case callbacks above; the lower-case
    # spellings are kept as aliases so existing callers keep working.
    startelement = startElement
    endelement = endElement

# SAX handler that writes an HTML rendering of the parsed feed to the file
# 'ddt_sax.html'. Element events are meant to be routed by the dispatcher
# base class to the start*/end* methods below, and character data is
# collected in self.temp between tags.
# NOTE(review): this listing appears damaged by extraction: identifiers are
# lower-cased (false/true/contenthandler), quotes are typographic, the HTML
# that used to sit inside the triple-quoted literals looks stripped, some
# literals are unterminated and the non-ASCII text is mis-encoded. It will
# not run as shown. TODO: restore from the original script before use.
class website(dispatcher, contenthandler):

# __init__ opens the output file and resets the state flags and text buffers;
# startchannel/endchannel write the page prologue and the trailing script.
def __init__(self): self.fout = open(‘ddt_sax.html’, ‘w’) self.imagein = false self.desflag = false self.item = false self.title = ” self.link = ” self.guid = ” self.url = ” self.pubdate = ” self.description = ” self.temp = ” self.prx = ” def startchannel(self): self.fout.write(”’\n\n rss-”’) def endchannel(self): self.fout.write(”’ 《script》 function gettimediff(str) { if(str == ”) { return ”; }

var pubdate = new date(str); var nowdate = new date(); var diffmilseconds = nowdate.valueof()-pubdate.valueof(); var days = diffmilseconds/86400000; days = parseint(days);

diffmilseconds = diffmilseconds-(days*86400000); var hours = diffmilseconds/3600000; hours = parseint(hours);

diffmilseconds = diffmilseconds-(hours*3600000); var minutes = diffmilseconds/60000; minutes = parseint(minutes);

diffmilseconds = diffmilseconds-(minutes*60000); var seconds = diffmilseconds/1000; seconds = parseint(seconds); var returnstr = “±±¾©Â·¢²¼ê±¼Ã¤£º” + pubdate.tolocalestring();

if(days > 0) { returnstr = returnstr + ” £Â¨¾Ã àëïöôú” + days + “ìì” + hours + “ð¡ê±” + minutes + “·öö󣩔; } else if (hours > 0) { returnstr = returnstr + ” £Â¨¾Ã àëïöôú” + hours + “ð¡ê±” + minutes + “·öö󣩔; } else if (minutes > 0) { returnstr = returnstr + ” £Â¨¾Ã àëïöôú” + minutes + “·öö󣩔; }

return returnstr;

}

function getspantext() { var pubdate; var pubdatearray; var spanarray = document.getelementsbytagname(“span”);

for(var i = 0; i < spanarray.length; i++) { pubdate = spanarray[i].innerhtml; document.getelementsbytagname("span")[i].innerhtml = gettimediff(pubdate); } }

getspantext(); 《script》 ”’) self.fout.close()

# characters appends every non-blank text chunk to self.temp; endtitle moves
# the buffered text into self.title and writes it out encoded as gb2312.
def characters(self, chars): if chars.strip(): #chars = chars.strip() self.temp += chars #print self.temp def starttitle(self): if self.item: self.fout.write(”’ \n\n ”’) def endtitle(self): if not self.imagein and not self.item: self.title = self.temp self.temp = ” self.fout.write(self.title.encode(‘gb2312′)) #self.title = self.temp self.fout.write(”’ \n\n\n\n 《script》\n

function copylink() { clipboarddata.setdata(“text”,window.location.href); alert(“rssá´½óòñ¾­¸´Ã¶æµ½¼ôìù°å”); }

function subscibelink() { var str = window.location.pathname; while(str.match(/^\//)) { str = str.replace(/^\//,””); } window.open(“http://rss.sina.com.cn/my_sina_web_rss_news.html?url=” + str,”_self”);

} 《script》\n \n \n \n ”’) if self.item: self.title = self.temp self.temp = ” self.fout.write(self.title.encode(‘gb2312′)) self.fout.write(”’ ”’)

# startimage/endimage toggle self.imagein, which the title, link and url
# handlers consult.
def startimage(self): self.imagein = true

# endlink stores the buffered text in self.link and clears self.temp; what it
# writes depends on the imagein and item flags.
def endimage(self): self.imagein = false def startlink(self): if self.imagein: self.fout.write(”’ def endlink(self): self.link = self.temp self.temp = ” if self.imagein: self.fout.write(self.link.encode(‘gb2312′)) self.fout.write(”'” target=”_blank”>\n ”’) elif self.item: #self.link = self.temp pass else: self.fout.write(self.link) self.fout.write(”’ ” target=” _blank “> ”’) self.fout.write(self.title.encode(‘gb2312′)) self.fout.write(”’ ”’) self.fout.write(self.description.encode(‘gb2312′)) self.fout.write(”’ ¸´Ã¶æ´ëò³á´½ó îòòªç¶èë¸ãðâîåáð±íµ½îòµÃ¤ò³ãæ£Â¨¼Ã²µ¥¡¢¿Ã¬ëù¡¢êµê±¡¢ãâ·ñ£© ”’)

# starturl writes a fragment only while self.imagein is set.
def starturl(self): if self.imagein: self.fout.write(”’\n ”’) if self.item: #self.url = self.temp pass

# Fallback handlers: defaultstart does nothing and defaultend clears
# self.temp. The remaining end* handlers copy the buffered text into their
# fields, and enditem writes the stored link, guid and pubdate.
def defaultstart(self): pass def defaultend(self): self.temp = ” def startdescription(self): pass def enddescription(self): self.description = self.temp self.temp = ” if self.item: #self.fout.write(‘¡¡¡¡’) self.fout.write(self.description.encode(‘gb2312’)) def endguid(self): self.guid = self.temp def endpubdate(self): if not self.temp.startswith(‘http’): self.pubdate = self.temp self.temp = ” else: self.pubdate = ” def startitem(self): self.item = true def enditem(self): self.item = false self.fout.write(”’ self.fout.write(self.link) self.fout.write(”’ ” target=”_blank”> ”’) self.fout.write(self.guid) self.fout.write(”’ ”’) self.fout.write(self.pubdate) self.fout.write(”’ ”’)

# Program entry point: parse 'ddt.xml' with the website handler when this
# file is run as a script. The guard used to be fused into the comment line
# and therefore never executed.
if __name__ == '__main__':
    parse('ddt.xml', website())