#!/usr/bin/env python
"""Print the article titles found in a bz2-compressed Wikipedia XML dump.

The compressed dump is read from standard input and every ``<title>`` value
is written to standard output as one UTF-8 encoded line, except for titles
that start with one of the prefixes in ``SKIPPED_TITLE_PREFIXES``.
"""
import bz2
import os
import sys
import xml.sax

# Titles starting with any of these prefixes are not printed.
SKIPPED_TITLE_PREFIXES = (
    'Talk:',
    'User talk:',
    'Wikipedia:',
    'Wikipedia talk:',
    'User:',
)


class WikipediaTitleHandler(xml.sax.ContentHandler):
    """SAX handler that emits the text of every ``<title>`` element.

    Args:
        out: optional binary file-like object receiving the UTF-8 encoded
            titles, one per line.  When omitted, standard output is used
            (resolved at write time).
    """

    def __init__(self, out=None):
        # Explicit base-class call instead of super(): ContentHandler is an
        # old-style class on Python 2.
        xml.sax.ContentHandler.__init__(self)
        self.out = out
        self.chars = []
        self.tag = None

    def startElement(self, name, attrs):
        """Remember the element being opened and start a fresh text buffer."""
        self.chars = []
        self.tag = name

    def characters(self, content):
        """Collect text chunks while inside a ``<title>`` element.

        The parser may deliver one text node in several chunks, so they are
        buffered and joined when the element closes.
        """
        if self.tag == 'title':
            self.chars.append(content)

    def endElement(self, name):
        """Emit the buffered title when a ``<title>`` element closes."""
        # Test the element that is actually closing and make sure text was
        # collected for it; checking only the last *opened* tag would emit
        # the same title again when the parent element closes right after.
        is_title = name == 'title' and self.tag == 'title'
        title = ''.join(self.chars)
        # Reset so that text following the closing tag is not buffered.
        self.tag = None
        self.chars = []
        if not is_title:
            return
        if title.startswith(SKIPPED_TITLE_PREFIXES):
            return
        self._write_line(title)

    def _write_line(self, title):
        """Write ``title`` as one UTF-8 encoded line to the output stream."""
        stream = self.out
        if stream is None:
            # Python 3 exposes the byte stream as sys.stdout.buffer; on
            # Python 2 sys.stdout itself accepts byte strings.
            stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(title.encode('utf8') + b'\n')


def process_xml(input, out=None):
    """Parse an XML source and emit the titles it contains.

    Args:
        input: anything ``xml.sax`` can parse from -- a file name or a
            file-like object.
        out: optional binary file-like object for the titles; defaults to
            standard output.
    """
    parser = xml.sax.make_parser()
    parser.setContentHandler(WikipediaTitleHandler(out))
    parser.parse(input)


if __name__ == '__main__':
    # Standard input is opened through its /dev/fd path because BZ2File
    # takes a file name here; the context manager closes it when done.
    with bz2.BZ2File('/dev/fd/0') as compressed:
        process_xml(compressed)

    # Disabled batch-mode sketch, kept for reference:
    # dirname = '/srv/slapgrid/slappart20/srv/runner/instance/slappart0/software_release/raw-data/'
    # filenames = os.listdir(dirname)
    # # ['enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2']
    # for fname in filenames:
    #     process_xml(os.path.join(dirname, fname))
    #     input = bz2.BZ2File(process_xml(os.path.join(dirname, fname)))