视频1 视频21 视频41 视频61 视频文章1 视频文章21 视频文章41 视频文章61 推荐1 推荐3 推荐5 推荐7 推荐9 推荐11 推荐13 推荐15 推荐17 推荐19 推荐21 推荐23 推荐25 推荐27 推荐29 推荐31 推荐33 推荐35 推荐37 推荐39 推荐41 推荐43 推荐45 推荐47 推荐49 关键词1 关键词101 关键词201 关键词301 关键词401 关键词501 关键词601 关键词701 关键词801 关键词901 关键词1001 关键词1101 关键词1201 关键词1301 关键词1401 关键词1501 关键词1601 关键词1701 关键词1801 关键词1901 视频扩展1 视频扩展6 视频扩展11 视频扩展16 文章1 文章201 文章401 文章601 文章801 文章1001 资讯1 资讯501 资讯1001 资讯1501 标签1 标签501 标签1001 关键词1 关键词501 关键词1001 关键词1501 专题2001
使用python解析xml成对应的html示例分享
2020-11-27 14:30:11 责编:小采
文档


本文使用SAX将ddt.xml解析成html。当然啦，如果得到了xml对应的xsl文件，也可以直接用libxml2将其转换成html。

代码如下:


#!/usr/bin/env python
# -*- coding: utf-8 -*-
#---------------------------------------
# 程序:XML解析器
# 版本:01.0
# 作者:mupeng
# 日期:2013-12-18
# 语言:Python 2.7
# 功能:将xml解析成对应的html
# 注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
# 继承ContentHandler并重写其事件处理函数
# Dispatcher主要用于相应标签的起始、结束事件的派发
#---------------------------------------
from xml.sax.handler import ContentHandler
from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element named ``title`` the start event is routed to
    ``startTitle()`` and the end event to ``endTitle()``. When no such
    method exists on the instance, ``defaultStart()`` / ``defaultEnd()``
    is called instead, if defined. Handlers are invoked with no arguments.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler for ``prefix`` + capitalized ``name``.

        Args:
            prefix: Event kind; used verbatim as the method-name prefix
                ('start' or 'end' when called from this class).
            name: Element name. It is passed through ``str.capitalize``,
                so ``pubDate`` resolves to ``<prefix>Pubdate``.
            attrs: Element attributes. Accepted for interface
                compatibility only; they are not forwarded to handlers.

        Returns:
            None. If neither a specific nor a default handler exists,
            the event is silently ignored.
        """
        handler = getattr(self, prefix + name.capitalize(), None)
        if not callable(handler):
            # No tag-specific handler: fall back to default<Prefix>.
            handler = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(handler):
            handler()

    def startElement(self, name, attrs):
        """SAX callback: dispatch the start-of-element event for ``name``."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: dispatch the end-of-element event for ``name``."""
        self.dispatch('end', name)

class Website(Dispatcher, ContentHandler):

def __init__(self):
self.fout = open('ddt_SAX.html', 'w')
self.imagein = False
self.desflag = False
self.item = False
self.title = ''
self.link = ''
self.guid = ''
self.url = ''
self.pubdate = ''
self.description = ''
self.temp = ''
self.prx = ''
def startChannel(self):

self.fout.write('''\n\n RSS-''')<p> def endChannel(self):<BR> self.fout.write('''<BR> <tr><td height="20"></td></tr><BR> </table><BR> </center><BR> <BR> function GetTimeDiff(str)<BR> {<BR> if(str == '')<BR> {<BR> return '';<BR> }</P> <P> var pubDate = new Date(str);<BR> var nowDate = new Date();<BR> var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();<BR> var days = diffMilSeconds/800000;<BR> days = parseInt(days);</P> <P> diffMilSeconds = diffMilSeconds-(days*800000);<BR> var hours = diffMilSeconds/3600000;<BR> hours = parseInt(hours);</P> <P> diffMilSeconds = diffMilSeconds-(hours*3600000);<BR> var minutes = diffMilSeconds/60000;<BR> minutes = parseInt(minutes);</P> <P> diffMilSeconds = diffMilSeconds-(minutes*60000);<BR> var seconds = diffMilSeconds/1000;<BR> seconds = parseInt(seconds);<p> var returnStr = "±±¾©·¢²¼Ê±¼ä£º" + pubDate.toLocaleString();</P> <P> if(days > 0)<BR> {<BR> returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + days + "Ìì" + hours + "Сʱ" + minutes + "·ÖÖÓ£©";<BR> }<BR> else if (hours > 0)<BR> {<BR> returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + hours + "Сʱ" + minutes + "·ÖÖÓ£©";<BR> }<BR> else if (minutes > 0)<BR> {<BR> returnStr = returnStr + " £¨¾àÀëÏÖÔÚ" + minutes + "·ÖÖÓ£©";<BR> }</P> <P> return returnStr;</P> <P> }</P> <P> function GetSpanText()<BR> {<BR> var pubDate;<BR> var pubDateArray;<BR> var spanArray = document.getElementsByTagName("span");</P> <P> for(var i = 0; i < spanArray.length; i++)<BR> {<BR> pubDate = spanArray[i].innerHTML;<BR> document.getElementsByTagName("span")[i].innerHTML = GetTimeDiff(pubDate); <BR> }<BR> }</P> <P> GetSpanText();<BR> </ script><BR> <div style="display:none;"><script>var _hmt = _hmt || [];(function() {var hm = document.createElement("script");hm.src = "https://hm.baidu.com/hm.js?de973f4fc4fc27c651135a4cd7781c2b";var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s);})();</script></div></body><BR> </html><BR> ''')<BR> self.fout.close()</P> <P> def characters(self, chars):<BR> if 
chars.strip():<BR> #chars = chars.strip()<BR> self.temp += chars<BR> #print self.temp<p> <BR> def startTitle(self):<p> if self.item:<BR> self.fout.write('''<BR> <tr bgcolor="#eeeeee">\n<td style="padding-top:5px;padding-left:5px;" height="30">\n<B><BR> ''')<p> def endTitle(self):<p> if not self.imagein and not self.item:<BR> self.title = self.temp<BR> self.temp = ''<BR> self.fout.write(self.title.encode('gb2312'))<p> #self.title = self.temp<BR> self.fout.write('''<BR> \n\n\n

\n
\n

function copyLink()
{
clipboardData.setData("Text",window.location.href);
alert("RSSÁ´½ÓÒѾ­¸´ÖƵ½¼ôÌù°å");
}

function subscibeLink()
{
var str = window.location.pathname;
while(str.match(/^\//))
{
str = str.replace(/^\//,"");
}
window.open("http://rss.sina.com.cn/my_sina_web_rss_news.html?url=" + str,"_self");

}
\n

\n
\n







\n
''')

if self.item:
self.title = self.temp
self.temp = ''
self.fout.write(self.title.encode('gb2312'))
self.fout.write('''


''')

def startImage(self):
self.imagein = True

def endImage(self):
self.imagein = False

def startLink(self):
if self.imagein:
self.fout.write('''

def endLink(self):
self.link = self.temp
self.temp = ''
if self.imagein:
self.fout.write(self.link.encode('gb2312'))
self.fout.write('''" target="_blank">\n ''')
elif self.item:
#self.link = self.temp
pass
else:
self.fout.write(self.link)
self.fout.write(''' " target="
_blank
"> ''')
self.fout.write(self.title.encode('gb2312'))
self.fout.write('''


''')
self.fout.write(self.description.encode('gb2312'))
self.fout.write('''
¸´ÖÆ´ËÒ³Á´½Ó ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ãæ£¨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ·Ñ£©


''')

def startUrl(self):
if self.imagein:
self.fout.write('''\n










''')

#程序入口
if __name__ == '__main__':
parse('ddt.xml', Website())

下载本文
显示全文
专题
''')
if self.item:
#self.url = self.temp
pass

def defaultStart(self):
pass
def defaultEnd(self):
self.temp = ''
def startDescription(self):
pass
def endDescription(self):
self.description = self.temp
self.temp = ''
if self.item:
#self.fout.write('¡¡¡¡')
self.fout.write(self.description.encode('gb2312'))

def endGuid(self):
self.guid = self.temp
def endPubdate(self):
if not self.temp.startswith('http'):
self.pubdate = self.temp
self.temp = ''
else:
self.pubdate = ''
def startItem(self):
self.item = True
def endItem(self):
self.item = False
self.fout.write('''


self.fout.write(self.link)
self.fout.write(''' " target="_blank"> ''')
self.fout.write(self.guid)
self.fout.write('''

''')
self.fout.write(self.pubdate)
self.fout.write('''