Source code for eWRT.input.corpus.bbc


#!/usr/bin/env python

""" 
    @package eWRT.input.corpus.bbc
    a generic method to retrieve text from the BBC corpus
"""

# -----------------------------------------------------------------------
# - (C)opyright 2009 by Albert Weichselbraun <albert@weichselbraun.net>
# -                    webLyzard technology gmbh <awe@weblyzard.com>
# -----------------------------------------------------------------------

__revision__ = "$Revision: 545 $"
__author__   = "Albert Weichselbraun"

from eWRT.config import BBC_CORPUS_LOW
from eWRT.input.conv.html import HtmlToText
from glob import glob
import os

class BBCGetCorpus(object):
    """ An iterator over the text of all matching corpus documents.

        Every file below the BBC_CORPUS_LOW directory which matches the
        given file pattern is read and converted with HtmlToText.getText.
        The iterator is single-pass: file names are removed from
        self.files as they get consumed.
    """

    def __init__(self, filePattern="*"):
        """ @param[in] filePattern Pattern of files to consider (e.g. 7[3456789]*.stm)
        """
        self.files = glob( os.path.join(BBC_CORPUS_LOW, filePattern) )

    def __iter__(self):
        """ @returns the object itself, as it implements the iterator protocol """
        return self

    def next(self):
        """ returns the converted text of the next document

            @returns the result of HtmlToText.getText for the file's content
            @raises StopIteration once all files have been consumed
        """
        if not self.files:
            raise StopIteration

        # files are taken from the end of the list; the context manager
        # ensures that the file handle gets closed again after reading
        with open( self.files.pop() ) as htmlFile:
            htmlTxt = htmlFile.read()
        return HtmlToText.getText( htmlTxt )

    # Python 3 looks for __next__ rather than next when iterating
    __next__ = next

    @staticmethod
    def getTitle(text):
        """ returns the title of a given text

            The title is the third line of the text with leading and
            trailing whitespace removed.

            @param[in] text the text as returned by the iterator
            @returns the title
            @raises IndexError if the text has fewer than three lines
        """
        return text.split("\n")[2].strip()


if __name__ == '__main__':
    # Writes one SQL INSERT statement per matching corpus document to stdout.

    def sqlEscape(value):
        """ doubles single quotes so that value fits into a quoted SQL literal """
        # NOTE(review): only single quotes are handled here; other special
        # characters (e.g. backslashes) are passed through unchanged.
        return value.replace("'", "''")

    for num, text in enumerate( BBCGetCorpus( "7[3456789]*.stm") ):
        title = BBCGetCorpus.getTitle(text)
        # a single parenthesised argument prints identically with the
        # Python 2 print statement and the Python 3 print function
        print("INSERT INTO evaluation_documents (content_id, title, content) VALUES ('%d', '%s', '%s');" % (num, sqlEscape(title), sqlEscape(text)))