#!/usr/bin/python
#coding=utf-8

"""
bus2wp.py
copyright (c) ant21(libsoft@gmail.com)

This is free software. It is distributed under the terms of the GPL.

Convert an XML file exported by blogbus to a WordPress extended RSS (WXR)
file. You can import the converted file in WordPress with all posts,
comments and categories.

This program can work with blogbus exported XML SchemaVersion "1.0-b"
and "1.1".

It was written for Python 2 (2.6 or later is required); the syntax used
here is also valid Python 3.
"""

import codecs
import datetime
import getopt
import re
import sys
from xml.dom import minidom


def usage():
    """Print the command line help text and exit with status 0."""
    print("""
Usage: bus2wp.py [options] inputFile outputFile

    -h --help       Show help message.
    -o --order      Output order of your blog items. Order is 'asc' or 'desc'.
    -c --commentid  Start index of your comment id. Default value is 1.
                    Use this option to specify comment id if you have
                    already WordPress posts and comments existed.
    -v --version    Display version info.

eg.
    # convert blogbus xml file to wordpress WXR file.
    python bus2wp.py bus.xml wp.xml

    # specify converted items with DESC order.
    python bus2wp.py -o desc bus.xml wp.xml

    # specify comment id start with 917.
    python bus2wp.py -c 917 bus.xml wp.xml
    """)
    sys.exit(0)


# SchemaVersion of the blogbus export being converted. Filled in by
# convert() on first use and read by createComments().
busversion = ''


def convert(inputFileName='bus.xml', outputFileName='wp.xml', order='asc'):
    """Convert a blogbus export file into a WordPress WXR file.

    inputFileName  -- blogbus XML export to read (anything accepted by
                      minidom.parse).
    outputFileName -- path of the WXR file to write.
    order          -- 'desc' writes the posts in reverse input order; any
                      other value (including None) keeps the input order.

    Progress is reported on stdout. If the input cannot be parsed, the
    error is printed and the process exits with status 1.
    """
    global busversion
    try:
        xmldoc = minidom.parse(inputFileName)
    except Exception as e:
        print('Failed.')
        print(e)
        # expat reports stray markup characters as
        # "not well-formed (invalid token): line N, column M".
        if '(invalid token)' in str(e):
            print('Please repair or delete invalid token like "& < >" there.')
        sys.exit(1)

    bus = xmldoc.documentElement
    if busversion == '':
        busversion = bus.getAttribute('SchemaVersion')
    logs = bus.getElementsByTagName('Log')

    impl = minidom.getDOMImplementation()
    dom = impl.createDocument(None, 'rss', None)
    root = dom.documentElement
    root.setAttribute('version', '2.0')
    root.setAttribute('xmlns:excerpt',
                      'http://wordpress.org/export/1.0/excerpt/')
    root.setAttribute('xmlns:content',
                      'http://purl.org/rss/1.0/modules/content/')
    root.setAttribute('xmlns:wfw', 'http://wellformedweb.org/CommentAPI/')
    root.setAttribute('xmlns:dc', 'http://purl.org/dc/elements/1.1/')
    root.setAttribute('xmlns:wp', 'http://wordpress.org/export/1.0/')
    channel = dom.createElement('channel')
    root.appendChild(channel)

    # Collect the items first so they can be reordered before they are
    # attached to the channel.
    items = []
    total = len(logs)
    sys.stdout.write('\n')
    for idx, log in enumerate(logs, 1):
        items.append(_createItem(dom, log))
        per = idx / float(total) * 100
        sys.stdout.write('\r[%.2f%%] Total Posts %d Converted %d'
                         % (per, total, idx))
        sys.stdout.flush()

    if (order or 'asc').lower() == 'desc':
        items.reverse()
    for item in items:
        channel.appendChild(item)

    writeDomToFile(dom, outputFileName)


def _elementText(parent, tagName):
    """Return the text of the first `tagName` descendant of `parent`."""
    return getElementData(parent.getElementsByTagName(tagName)[0])


def _createItem(dom, log):
    """Build the WXR <item> element for one blogbus <Log> element."""
    title_text = _elementText(log, 'Title')
    content_text = _elementText(log, 'Content')
    pubdate = _elementText(log, 'LogDate')
    creator = _elementText(log, 'Writer')
    # blogbus only supports one category per post
    category_text = _elementText(log, 'Sort')
    # Tags are separated by single spaces; drop the empty strings that
    # leading, trailing or repeated spaces would otherwise produce.
    tag_list = [t for t in _elementText(log, 'Tags').split(' ') if t]
    comments = log.getElementsByTagName('Comment')

    item = dom.createElement('item')
    item.appendChild(createElement(dom, 'title', title_text))
    item.appendChild(createElement(dom, 'pubDate', convertPubDate(pubdate)))
    item.appendChild(createElement(dom, 'dc:creator', creator))

    # The category is written twice: once plain and once with the
    # domain/nicename attributes.
    item.appendChild(createElement(dom, 'category', category_text, 'cdata'))
    category_element = createElement(dom, 'category', category_text, 'cdata')
    category_element.setAttribute('domain', 'category')
    category_element.setAttribute('nicename', category_text)
    item.appendChild(category_element)

    # Each tag is likewise written twice, without and with a nicename.
    for tag in tag_list:
        tag_element = createElement(dom, 'category', tag, 'cdata')
        tag_element.setAttribute('domain', 'tag')
        item.appendChild(tag_element)
        tag_element2 = createElement(dom, 'category', tag, 'cdata')
        tag_element2.setAttribute('domain', 'tag')
        tag_element2.setAttribute('nicename', tag)
        item.appendChild(tag_element2)

    item.appendChild(
        createElement(dom, 'content:encoded', content_text, 'cdata'))
    item.appendChild(createElement(dom, 'wp:post_date', pubdate))
    item.appendChild(createElement(dom, 'wp:status', 'publish'))

    for comment_element in createComments(dom, comments):
        item.appendChild(comment_element)
    return item


def getElementData(element):
    """Return the concatenated text and CDATA directly inside `element`.

    Only direct children are considered; text inside nested elements is
    ignored. Returns an empty string when there is none.
    """
    return ''.join(
        node.data for node in element.childNodes
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE))


def createComments(dom, comments):
    """Return a list of <wp:comment> elements for blogbus <Comment> nodes.

    The commenter IP is only looked up when the module-level `busversion`
    is "1.0-b"; SchemaVersion "1.1" seems to have no "PostIP" tag.
    """
    elements = []
    for comment in comments:
        email = comment.getElementsByTagName('Email')[0]
        homepage = comment.getElementsByTagName('HomePage')[0]
        name = comment.getElementsByTagName('NiceName')[0]
        content = comment.getElementsByTagName('CommentText')[0]
        date = comment.getElementsByTagName('CreateTime')[0]

        ip = None
        if busversion == '1.0-b':
            ip_nodes = comment.getElementsByTagName('PostIP')
            if ip_nodes:
                ip = ip_nodes[0]

        elements.append(
            createCommentElement(dom, email, homepage, name, content, date,
                                 ip))
    return elements


def createCommentElement(dom, email, homepage, name, content, date, ip=None):
    """Build one <wp:comment> element.

    All arguments except `dom` are DOM elements whose text is read with
    getElementData(); `ip` may be None. The comment id is taken from the
    module-level `commentID` generator. The e-mail, URL and IP children
    are only included when they pass the corresponding validate*() check.
    """
    comment_id = str(next(commentID))
    comment_author = getElementData(name)
    comment_author_email = getElementData(email)
    comment_author_url = getElementData(homepage)
    comment_date = getElementData(date)
    comment_content = getElementData(content)

    comment_element = dom.createElement('wp:comment')
    # for WP 2.9.1 there is comment_id element
    comment_element.appendChild(
        createElement(dom, 'wp:comment_id', comment_id))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_author', comment_author))

    # validate email, url and ip before writing them out
    if validateEmail(comment_author_email):
        comment_element.appendChild(
            createElement(dom, 'wp:comment_author_email',
                          comment_author_email))
    if validateUrl(comment_author_url):
        comment_element.appendChild(
            createElement(dom, 'wp:comment_author_url', comment_author_url))
    if ip is not None:
        comment_author_ip = getElementData(ip)
        if validateIP(comment_author_ip):
            comment_element.appendChild(
                createElement(dom, 'wp:comment_author_IP',
                              comment_author_ip))

    # The same timestamp is written as both local and GMT date.
    comment_element.appendChild(
        createElement(dom, 'wp:comment_date', comment_date))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_date_gmt', comment_date))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_content', comment_content, 'cdata'))
    comment_element.appendChild(
        createElement(dom, 'wp:comment_approved', '1'))
    return comment_element


def createElement(dom, elementName, elementValue, type='text'):
    """Create element `elementName` holding `elementValue`.

    type -- 'text' stores the value as an entity-escaped text node,
            'cdata' stores it as a CDATA section. A value containing
            "]]>" cannot live inside a CDATA section, so it is always
            stored as escaped text.

    Raises ValueError for any other `type`.
    """
    tag = dom.createElement(elementName)
    if ']]>' in elementValue:
        type = 'text'
    if type == 'text':
        # NOTE(review): minidom's writexml() escapes text nodes again on
        # output, so these entities end up double-escaped in the file.
        # Presumably that is what the WordPress importer expects - confirm.
        elementValue = elementValue.replace('&', '&amp;')
        elementValue = elementValue.replace('<', '&lt;')
        elementValue = elementValue.replace('>', '&gt;')
        elementValue = elementValue.replace('\'', '&apos;')
        elementValue = elementValue.replace('"', '&quot;')
        text = dom.createTextNode(elementValue)
    elif type == 'cdata':
        text = dom.createCDATASection(elementValue)
    else:
        raise ValueError("type must be 'text' or 'cdata', got %r" % (type,))
    tag.appendChild(text)
    return tag


def counter(i=1):
    """Generate i, i+1, i+2, ...; send(n) makes n the current value."""
    while True:
        val = (yield i)
        if val is not None:
            i = val
        else:
            i += 1


# Source of comment ids shared by every comment written during a run.
commentID = counter()

# Fixed English names: strftime() would follow the current locale.
_WEEKDAYS = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
_MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')


def convertPubDate(date, timediff='+0000'):
    """Convert "YYYY-MM-DD HH:MM:SS" to an RSS style date.

    For example "2003-08-22 16:01:56" becomes
    "Fri, 22 Aug 2003 16:01:56 +0000". The time part is copied verbatim
    and `timediff` is appended as the zone offset.
    """
    year, mon, day = int(date[:4]), int(date[5:7]), int(date[8:10])
    time = date[11:]
    weekday = _WEEKDAYS[datetime.date(year, mon, day).weekday()]
    month = _MONTHS[mon - 1]
    return "%s, %d %s %s %s %s" % (weekday, day, month, year, time, timediff)


_IP_RE = re.compile(
    r'^([01]?\d\d?|2[0-4]\d|25[0-5])(\.([01]?\d\d?|2[0-4]\d|25[0-5])){3}$')
_EMAIL_RE = re.compile(
    r'^[0-9a-z][_.0-9a-z-]{0,31}@'
    r'([0-9a-z][0-9a-z-]{0,30}[0-9a-z]\.){1,4}[a-z]{2,4}$')
# The scheme class used to be [a-zA-z], which also accepted the
# punctuation between 'Z' and 'a' (such as '_' and '^').
_URL_RE = re.compile(
    r'^[a-zA-Z]+://(\w+(-\w+)*)(\.(\w+(-\w+)*))*(\?\S*)?$')


def validateIP(ip):
    """Return True if `ip` is a dotted-quad IPv4 address (0-255 each)."""
    return _IP_RE.match(ip) is not None


def validateEmail(email):
    """Return True if `email` matches the lower-case address pattern."""
    return _EMAIL_RE.match(email) is not None


def validateUrl(url):
    """Return True if `url` is "scheme://host" with an optional ?query.

    The pattern has no provision for a port or a path, so URLs that
    contain one are rejected.
    """
    return _URL_RE.match(url) is not None


def makeIndent(dom, node, indent=0):
    """Insert newline/tab text nodes so `node` is written indented.

    Works in place and recurses into child elements with indent + 1.
    """
    # Copy child list because it will change soon
    children = node.childNodes[:]
    # Main node doesn't need to be indented
    if indent:
        text = dom.createTextNode('\n' + '\t' * indent)
        node.parentNode.insertBefore(text, node)
    if children:
        # Append newline after last child, except for text nodes
        if children[-1].nodeType == node.ELEMENT_NODE:
            text = dom.createTextNode('\n' + '\t' * indent)
            node.appendChild(text)
        # Indent children which are elements
        for n in children:
            if n.nodeType == node.ELEMENT_NODE:
                makeIndent(dom, n, indent + 1)


def writeDomToFile(dom, filename):
    """Write an indented, UTF-8 encoded copy of `dom` to `filename`.

    The indentation is applied to a clone, so `dom` itself is left
    untouched. The file is closed even if writing fails.
    """
    domcopy = dom.cloneNode(True)
    try:
        makeIndent(domcopy, domcopy.documentElement)
        with open(filename, 'wb') as f:
            writer = codecs.getwriter('utf-8')(f)
            domcopy.writexml(writer, encoding='utf-8')
    finally:
        domcopy.unlink()


def main(argv=None):
    """Command line entry point.

    argv -- full argument vector including the program name; defaults
            to sys.argv.

    Exits with status 2 on bad options; usage() exits with status 0.
    """
    if argv is None:
        argv = sys.argv
    # parse command line options; a trailing "=" marks long options
    # that take a value.
    try:
        opts, args = getopt.getopt(
            argv[1:], "o:c:hv",
            ["order=", "commentid=", "help", "version"])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(2)

    # process options
    order = None
    for o, a in opts:
        if o in ("-o", "--order"):
            if a.lower() in ('asc', 'desc'):
                order = a.lower()
            else:
                usage()
        elif o in ("-c", "--commentid"):
            if a.isdigit():
                # Prime the generator, then park it one below the
                # requested id so the next value handed out is `a`.
                next(commentID)
                commentID.send(int(a) - 1)
            else:
                print('Comment id should be integer.')
                sys.exit(2)
        elif o in ("-h", "--help"):
            usage()
        elif o in ("-v", "--version"):
            print('bus2wp.py version 0.11.0626')
            sys.exit(0)

    # process arguments
    if len(args) == 2:
        sys.stdout.write('Converting...')
        start = datetime.datetime.now()
        convert(args[0], args[1], order)
        end = datetime.datetime.now()
        sys.stdout.write('\n')
        print('Done. Elapse %d seconds.' % (end - start).seconds)
    else:
        usage()


if __name__ == "__main__":
    sys.exit(main())