#!/usr/bin/python
# coding=utf-8
"""
bus2wp.py

copyright (c) ant21(libsoft@gmail.com)
add tags/category support. daniel(danielmethod@gmail.com)

This is free software. It's distributed under the terms of GPL.

Convert xml file exported by blogbus to wordpress extended rss file.
You could import the converted file in wordpress with all posts,
comments and categories.

This program can work with blogbus exported xml SchemaVersion
"1.0-b" and "1.1".

Usage: bus2wp.py [options] inputFile outputFile

    -h --help       Show help message.
    -o --order      Output order of your blog items.
                    Order is 'asc' or 'desc'.
    -v --version    Display version info.

eg.
    python bus2wp.py bus.xml wp.xml
    python bus2wp.py -o desc bus.xml wp.xml
"""

import codecs
import datetime
import getopt
import re
import sys
from xml.dom import minidom

# SchemaVersion of the first blogbus file converted in this process.  Kept
# as a module-level name for backward compatibility; the conversion itself
# no longer depends on it.
busversion = ''

# RFC 822 day and month abbreviations, indexed by date.weekday() (Mon == 0)
# and by month number - 1.
_WEEKDAY_NAMES = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
_MONTH_NAMES = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# Replacements applied to text values, '&' first so that the entities
# produced by the later replacements are not escaped again.
_TEXT_ESCAPES = (
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('\'', '&apos;'),
    ('"', '&quot;'),
)

_IP_RE = re.compile(
    r'^([01]?\d\d?|2[0-4]\d|25[0-5])(\.([01]?\d\d?|2[0-4]\d|25[0-5])){3}$')
_EMAIL_RE = re.compile(
    r'^[0-9a-z][_.0-9a-z-]{0,31}@([0-9a-z][0-9a-z-]{0,30}[0-9a-z]\.){1,4}[a-z]{2,4}$')
# NOTE(review): this pattern accepts scheme://host[?query] only, so URLs
# with a path (even a trailing '/') are rejected -- confirm that is intended.
_URL_RE = re.compile(r'^[a-zA-Z]+://(\w+(-\w+)*)(\.(\w+(-\w+)*))*(\?\S*)?$')


def _firstElement(parent, tagName):
    """Return the first descendant of `parent` named `tagName`, or None."""
    found = parent.getElementsByTagName(tagName)
    if found:
        return found[0]
    return None


def _buildItem(dom, log):
    """Build the RSS <item> element for one blogbus <Log> element."""
    title_text = getElementData(_firstElement(log, 'Title'))
    content_text = getElementData(_firstElement(log, 'Content'))
    pubdate = getElementData(_firstElement(log, 'LogDate'))
    creator = getElementData(_firstElement(log, 'Writer'))
    category = getElementData(_firstElement(log, 'Sort'))
    # Tags are separated by single spaces; drop the empty strings that an
    # empty value or repeated spaces would otherwise yield.
    tag_text = getElementData(_firstElement(log, 'Tags'))
    tags = [tag for tag in tag_text.split(' ') if tag]
    comments = log.getElementsByTagName('Comment')

    item = dom.createElement('item')
    item.appendChild(createElement(dom, 'title', title_text))
    item.appendChild(createElement(dom, 'pubDate', convertPubDate(pubdate)))
    item.appendChild(createElement(dom, 'dc:creator', creator))

    if category:
        # The category is written twice: once plain and once with
        # domain="category".
        item.appendChild(createElement(dom, 'category', category, 'cdata'))
        domain_category = createElement(dom, 'category', category, 'cdata')
        domain_category.setAttribute('domain', 'category')
        item.appendChild(domain_category)

    for tag in tags:
        tag_element = createElement(dom, 'category', tag, 'cdata')
        tag_element.setAttribute('domain', 'tag')
        item.appendChild(tag_element)

    item.appendChild(
        createElement(dom, 'content:encoded', content_text, 'cdata'))
    item.appendChild(createElement(dom, 'wp:post_date', pubdate))
    item.appendChild(createElement(dom, 'wp:status', 'publish'))

    for comment_element in createComments(dom, comments):
        item.appendChild(comment_element)
    return item


def convert(inputFileName='bus.xml', outputFileName='wp.xml', order='asc'):
    """Convert a blogbus export file into a WordPress extended RSS file.

    inputFileName  -- path of the blogbus XML export to read.
    outputFileName -- path of the RSS file to write.
    order          -- 'desc' (case-insensitive) writes the posts in reverse
                      of their order in the input; any other value keeps
                      the input order.

    If the input cannot be parsed, a message is printed and the process
    exits with status 1.
    """
    global busversion
    try:
        xmldoc = minidom.parse(inputFileName)
    except Exception as e:
        # Command-line boundary: report the problem and stop.
        print('Fail.')
        print(e)
        print('Please repair or delete invalid token like "& < >" there.')
        sys.exit(1)

    bus = xmldoc.documentElement
    if busversion == '':
        busversion = bus.getAttribute('SchemaVersion')

    dom = minidom.getDOMImplementation().createDocument(None, 'rss', None)
    root = dom.documentElement
    root.setAttribute('version', '2.0')
    root.setAttribute('xmlns:content',
                      'http://purl.org/rss/1.0/modules/content/')
    root.setAttribute('xmlns:wfw', 'http://wellformedweb.org/CommentAPI/')
    root.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/')
    root.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/')
    channel = dom.createElement('channel')
    root.appendChild(channel)

    # Collect the items first so that they can be reversed before being
    # attached to the channel.
    items = [_buildItem(dom, log) for log in bus.getElementsByTagName('Log')]
    if str(order).lower() == 'desc':
        items.reverse()
    for item in items:
        channel.appendChild(item)

    writeDomToFile(dom, outputFileName)


def getElementData(element):
    """Return the concatenated text and CDATA children of `element`.

    Returns '' when `element` is None, so callers can pass the result of a
    lookup that found nothing.
    """
    if element is None:
        return ''
    return ''.join(
        node.data for node in element.childNodes
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE))


def createComments(dom, comments):
    """Return a list of <wp:comment> elements, one per blogbus <Comment>.

    The "PostIP" child (present in SchemaVersion "1.0-b" exports) is used
    whenever the comment has one; fields that are absent are treated as
    empty.
    """
    elements = []
    for comment in comments:
        elements.append(createCommentElement(
            dom,
            _firstElement(comment, 'Email'),
            _firstElement(comment, 'HomePage'),
            _firstElement(comment, 'NiceName'),
            _firstElement(comment, 'CommentText'),
            _firstElement(comment, 'CreateTime'),
            _firstElement(comment, 'PostIP')))
    return elements


def createCommentElement(dom, email, homepage, name, content, date, ip=None):
    """Build one <wp:comment> element.

    email, homepage, name, content, date and ip are the source DOM elements
    holding the respective values (None is treated as empty).  The author
    e-mail, URL and IP are only written when they pass validateEmail,
    validateUrl and validateIP respectively.
    """
    comment_author = getElementData(name)
    comment_author_email = getElementData(email)
    comment_author_url = getElementData(homepage)
    comment_date = getElementData(date)
    comment_content = getElementData(content)

    comment_element = dom.createElement('wp:comment')
    comment_element.appendChild(
        createElement(dom, 'wp:comment_author', comment_author))
    if validateEmail(comment_author_email):
        comment_element.appendChild(createElement(
            dom, 'wp:comment_author_email', comment_author_email))
    if validateUrl(comment_author_url):
        comment_element.appendChild(createElement(
            dom, 'wp:comment_author_url', comment_author_url))
    if ip is not None:
        comment_author_ip = getElementData(ip)
        if validateIP(comment_author_ip):
            comment_element.appendChild(createElement(
                dom, 'wp:comment_author_IP', comment_author_ip))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_date', comment_date))
    # The source carries a single timestamp; it is written unchanged as the
    # GMT date as well.
    comment_element.appendChild(
        createElement(dom, 'wp:comment_date_gmt', comment_date))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_content', comment_content, 'cdata'))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_approved', '1'))
    return comment_element


def createElement_1(dom, elementName, elementTextValue):
    """Return a new element holding `elementTextValue` as an unescaped text
    node.  Not called from within this file."""
    aElement = dom.createElement(elementName)
    aElement.appendChild(dom.createTextNode(elementTextValue))
    return aElement


def createElement(dom, elementName, elementValue, type='text'):
    """Return a new element named `elementName` containing `elementValue`.

    type -- 'text' stores the value in a text node after replacing
            & < > ' " with their XML entities; 'cdata' stores it in a CDATA
            section.  A value containing ']]>' cannot be placed in a CDATA
            section, so it is always stored as text.

    Raises ValueError for any other `type`.
    """
    tag = dom.createElement(elementName)
    if ']]>' in elementValue:
        type = 'text'
    if type == 'text':
        # NOTE(review): minidom escapes text nodes again when serializing,
        # so these entities end up double-escaped in the file -- presumably
        # so the importer receives HTML-escaped text; confirm.
        for raw, entity in _TEXT_ESCAPES:
            elementValue = elementValue.replace(raw, entity)
        node = dom.createTextNode(elementValue)
    elif type == 'cdata':
        node = dom.createCDATASection(elementValue)
    else:
        raise ValueError('unknown element type: %r' % (type,))
    tag.appendChild(node)
    return tag


def convertPubDate(date, timediff='+0000'):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to an RFC 822 style date.

    For example '2007-08-23 05:47:54' becomes
    'Thu, 23 Aug 2007 05:47:54 +0000'.  Everything after the date part is
    copied through unchanged and `timediff` is appended as the zone.

    Raises ValueError if the date part is not a valid calendar date.
    """
    year, mon, day = int(date[:4]), int(date[5:7]), int(date[8:10])
    clock = date[11:]
    weekday = _WEEKDAY_NAMES[datetime.date(year, mon, day).weekday()]
    month = _MONTH_NAMES[mon - 1]
    return '%s, %d %s %s %s %s' % (weekday, day, month, year, clock, timediff)


def validateIP(ip):
    """Return True if `ip` is a dotted-quad IPv4 address."""
    return bool(_IP_RE.match(ip))


def validateEmail(email):
    """Return True if `email` matches the (lower-case only) address
    pattern."""
    return bool(_EMAIL_RE.match(email))


def validateUrl(url):
    """Return True if `url` has the form scheme://host with an optional
    ?query part."""
    return bool(_URL_RE.match(url))


def makeIndent(dom, node, indent=0):
    """Insert newline/tab text nodes so that `node`'s subtree is written
    indented, one tab per nesting level.  Modifies the tree in place."""
    # Copy the child list because the insertions below change it.
    children = list(node.childNodes)
    # The main node doesn't need to be indented.
    if indent:
        node.parentNode.insertBefore(
            dom.createTextNode('\n' + '\t' * indent), node)
    if children:
        # Put the closing tag on its own line, except after text nodes.
        if children[-1].nodeType == node.ELEMENT_NODE:
            node.appendChild(dom.createTextNode('\n' + '\t' * indent))
        # Indent the children which are elements.
        for child in children:
            if child.nodeType == node.ELEMENT_NODE:
                makeIndent(dom, child, indent + 1)


def writeDomToFile(dom, filename):
    """Write an indented copy of `dom` to `filename` as UTF-8 XML.

    The document passed in is left untouched; indentation is applied to a
    deep copy, which is released afterwards even if writing fails.
    """
    domcopy = dom.cloneNode(True)
    try:
        makeIndent(domcopy, domcopy.documentElement)
        with codecs.open(filename, 'w', 'utf-8') as writer:
            domcopy.writexml(writer, encoding='utf-8')
    finally:
        domcopy.unlink()


def main(argv=None):
    """Command-line entry point.

    argv -- full argument vector including the program name; defaults to
            sys.argv.

    Returns None after a successful conversion and 2 when the number of
    file arguments is wrong.  Exits via sys.exit for --help, --version and
    option errors.
    """
    if argv is None:
        argv = sys.argv
    # parse command line options
    try:
        opts, args = getopt.getopt(argv[1:], 'o:hv',
                                   ['order=', 'help', 'version'])
    except getopt.error as msg:
        print(msg)
        print('for help use --help')
        sys.exit(2)

    # process options
    order = None
    for o, a in opts:
        if o in ('-o', '--order'):
            if a.lower() in ('asc', 'desc'):
                order = a.lower()
            else:
                print(__doc__)
                sys.exit(0)
        elif o in ('-h', '--help'):
            print(__doc__)
            sys.exit(0)
        elif o in ('-v', '--version'):
            print('bus2wp.py version 0.9.0101')
            sys.exit(0)

    # process arguments
    if len(args) != 2:
        print(__doc__)
        return 2
    sys.stdout.write('Converting... ')
    sys.stdout.flush()
    start = datetime.datetime.now()
    convert(args[0], args[1], order)
    end = datetime.datetime.now()
    print('Done. Elapse %d seconds.' % (end - start).seconds)


if __name__ == "__main__":
    sys.exit(main())