# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from __future__ import print_function

"""
corpusbuilder.py is part of Coquery.

Copyright (c) 2016 Gero Kunter (gero.kunter@coquery.org)

Coquery is released under the terms of the GNU General Public License (v3).
For details, see the file LICENSE that you should have received along 
with Coquery. If not, see <http://www.gnu.org/licenses/>.
"""

""" 
The module :mod:`corpusbuilder.py` provides the framework for corpus module
installers.

Different corpora may use different file formats and different file
layouts to provide the content of the corpus to the users. In order to
make this content available to Coquery, the content of the corpus files
must be processed, and stored in a database. Once this has been done,
a corpus module, containing information on the database layout, will be
written to a place where Coquery can find it. After that, the corpus 
can be queried by Coquery.

Thus, in order to use a new corpus with Coquery, a subclass of 
:class:`BaseCorpusBuilder` needs to be defined that is tailored to the
structure of that corpus. The name of this subclass has to be 
:class:`BuilderClass`. Usually, such a subclass will at least 
reimplement :func:`BaseCorpusBuilder.__init__`. The reimplementation
contains the specifications for the data tables such as the name and data
type of the columns. It also specifies links between different data tables.
Please note that the reimplemented :func:`__init__` should start with a
call to the inherited initialization method, like so::

    super(BuilderClass, self).__init__(gui)

In addition to that, most subclasses will also reimplement either
:func:`BaseCorpusBuilder.process_file` or one of the related methods (e.g.
:func:`BaseCorpusBuilder.process_text_file`). The reimplemented method is
aware of the data format that is used in the corpus data files, and is 
therefore able to process the information stored in the data files. It is 
responsible for storing the information correctly in the pertaining data 
tables defined in :func:`BaseCorpusBuilder.__init__`.

Examples
--------    
For examples of reimplementations of ``BaseCorpusBuilder``, see the 
corpus installers distributed in the Coquery default installation. For 
instance, :mod:`coq_install_generic.py` is a generic installer that processes
any collection of text files in a directory into a query-able corpus, and
:mod:`coq_install_bnc.py` contains an installer that reads and processes the 
XML version of the British National Corpus.
"""

# Python 2/3 compatibility shim: under Python 2, rebind ``str`` to the
# ``unicode`` type so that the rest of the module handles text uniformly.
# Under Python 3 the name ``unicode`` does not exist; the resulting
# NameError is the only error expected here, and the builtin ``str`` is
# left untouched.
try:
    str = unicode
except NameError:
    pass

import getpass
import codecs
import logging
import collections
import os.path
import warnings
import time
import pandas as pd
import unicodedata
import argparse
import re
import sys
import textwrap
import fnmatch
import inspect

try:
    from lxml import etree as ET
except ImportError:
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET
        
from . import sqlhelper
from . import sqlwrap
from . import options
from . import corpus

from .errors import *
from .defines import *
from .unicode import utf8

# Module-level cache; every key that has not been seen yet starts out with
# an empty list.
# NOTE(review): nothing in the reviewed part of this module reads or writes
# this cache -- confirm that it is still needed.
insert_cache = collections.defaultdict(list)

# Template for a block of class-level code (static methods plus the
# ``_is_adhoc`` flag). The placeholders {name}, {db_name},
# {is_tagged_corpus} and {description} have to be supplied by the code that
# uses the template.
# NOTE(review): presumably filled in by ``str.format()`` when an ad-hoc
# corpus installer is generated -- the code doing so is not visible here.
new_code_str = """
    @staticmethod
    def get_name():
        return "{name}"

    @staticmethod
    def get_db_name():
        return "{db_name}"
    
    @staticmethod
    def get_title():
        return "{name}: a {is_tagged_corpus}"
        
    _is_adhoc = True
        
    @staticmethod
    def get_description():
        return ["{description}"]
    """

# Text stating that a corpus installer was generated by Coquery.
# NOTE(review): presumably used as the doc string of generated installers;
# the code using it is not visible here.
new_doc_string = """
This corpus installer was generated by the corpus query tool Coquery
(http://www.coquery.org).
"""

class MethodNotImplementedError(Exception):
    """
    Exception signalling that a function has not been implemented.

    The class attribute ``msg`` holds a generic description of the error.
    """

    msg = "Function not implemented."

# module_code contains the Python skeleton code that will be used to write
# the Python corpus module. The placeholders {name}, {display_name},
# {db_name}, {url}, {variables}, {resource_code}, {lexicon_code} and
# {corpus_code} have to be supplied by the code that writes the module.
module_code = """# -*- coding: utf-8 -*-
#
# FILENAME: {name}.py -- a corpus module for the Coquery corpus query tool
# 
# This module was automatically created by corpusbuilder.py.
#

from __future__ import unicode_literals
from coquery.corpus import *

class Resource(SQLResource):
    name = '{name}'
    display_name = '{display_name}'
    db_name = '{db_name}'
    url = '{url}'
{variables}
{resource_code}
    
class Lexicon(LexiconClass):
    '''
    Corpus-specific code
    '''
    
{lexicon_code}

class Corpus(CorpusClass):
    '''
    Corpus-specific code
    '''
    
{corpus_code}
"""

# Corpus builders should include code that determines word counts for 
# subcorpora. More specifically, they should produce a table with all
# combinations of corpus features and the associated number of words.
# For example, COCA should have a table with Genre, Year and Frequency,
# with 5 x 23 rows (5 Genres, 23 Years). 

class Column(object):
    """
    Store the description of a single column of an SQL table.

    Class attributes
    ----------------
    is_identifier : bool
        True for column classes that act as the primary key of a table.
    key : bool
        True for column classes that link to another table.
    """
    is_identifier = False
    key = False

    def __init__(self, name, data_type, index_length=None):
        """
        Initialize the column.

        Parameters
        ----------
        name : str
            The name of the column
        data_type : str
            A MySQL data type description
        index_length : int or None
            The length of the index for this column. If None, the index length
            will be determined automatically, which can take quite some time
            for larger corpora.
        """
        self._name = name
        self._data_type = data_type
        self.index_length = index_length
        self.unique = False

    def __repr__(self):
        return "Column({}, {}, {})".format(
            self._name, self._data_type, self.index_length)

    @property
    def name(self):
        """ Return the name of the column."""
        return self._name

    @name.setter
    def name(self, new_name):
        self._name = new_name

    @property
    def data_type(self):
        """
        Return the data type of the column.

        Returns
        -------
        data_type : string
            The data type of the column in the same form as used by the
            MySQL CREATE TABLE command.
        """
        return self._data_type

    @data_type.setter
    def data_type(self, new_type):
        self._data_type = new_type

    @property
    def base_type(self):
        """
        Return the base type of the column.

        This property does not return the field length, but only the base
        data type, i.e. VARCHAR, MEDIUMINT, etc. Use data_type for the full
        column specification.

        Returns
        -------
        base_type : string
            A MySQL base data type. An empty string is returned if the
            column has no data type yet (e.g. a link column whose data type
            has not been resolved).
        """
        tokens = self._data_type.split()
        if not tokens:
            # an empty (or whitespace-only) type specification has no base
            # type; indexing the empty token list would raise IndexError
            return ""
        return tokens[0].partition("(")[0]
        
class Identifier(Column):
    """
    Define a Column class that acts as the primary key in a table.

    The ``name`` property (getter and setter) is inherited from
    :class:`Column` and always yields the plain column name, both for unique
    and for non-unique identifiers. The former override of ``name`` returned
    the same value, but contained unreachable code and removed the setter.
    """
    is_identifier = True

    def __init__(self, name, data_type, unique=True, index_length=None):
        """
        Initialize the identifier column.

        Parameters
        ----------
        name : str
            The name of the column
        data_type : str
            A MySQL data type description
        unique : bool
            Stored in the attribute ``unique``. Table.get_create_string()
            adds a surrogate key column ``<name>_primary`` for identifiers
            that are not unique.
        index_length : int or None
            The length of the index for this column
        """
        super(Identifier, self).__init__(name, data_type, index_length)
        self.unique = unique

    def __repr__(self):
        return "Identifier(name='{}', data_type='{}', unique={}, index_length={})".format(
            self._name, self._data_type, self.unique, self.index_length)
        
class Link(Column):
    """
    A column that refers to another table, comparable to a foreign key in
    MySQL terms.

    The name of the linked table is kept in the attribute ``_link``. The
    column is created with an empty data type.
    """
    key = True

    def __init__(self, name, table_name):
        # NOTE(review): True is handed over as the index length of the
        # column -- confirm that this is intended.
        super(Link, self).__init__(name, data_type="", index_length=True)
        self._link = table_name

    def __repr__(self):
        template = "Link(name='{}', '{}', data_type='{}')"
        return template.format(self._name, self._link, self._data_type)
        
class Table(object):
    """
    Store the definition of an SQL table together with the rows that have
    been added to it, but not yet written to the database.

    Columns are registered by add_column(). New rows are collected by add()
    and get_or_insert(), and are written to the database by commit(). The
    database connection has to be provided by setDB() before commit() or
    find() can be used.
    """
    def __init__(self, name):
        self._name = name
        self.columns = list()
        self.primary = None
        self._current_id = 0
        self._row_order = []
        self._add_cache = dict()
        self._add_cache2 = list()
        # The defaultdict _add_lookup will store the index of rows in this
        # table. It uses the trick described at http://ikigomu.com/?p=186
        # to achieve an O(1) lookup. When looking up a row as in
        #
        # x = self._add_lookup[tuple([row[x] for x in self._row_order])]
        #
        # the returned value is the length of the lookup table at the time
        # the entry was created. In other words, this is the row id of that
        # row.
        self._add_lookup = collections.defaultdict(lambda: len(self._add_lookup) + 1)
        self._commited = {}
        self._col_names = None
        self._engine = None
        self._max_cache = 0
        # database connection, provided later by setDB()
        self._DB = None

    @property
    def name(self):
        """ Return the name of the table."""
        return self._name

    @name.setter
    def name(self, s):
        self._name = s

    def setDB(self, db):
        """ Set the database connection that is used by commit() and find()."""
        self._DB = db

    def commit(self, strange_check=False):
        """
        Commit the table content to the data base.

        This method writes the rows that have been added since the last
        commit to the data base, and empties the row cache afterwards.

        As this method is usually called after a file has been processed,
        this ensures that all new table rows are commited, while at the same
        time preserving some memory space.

        Parameters
        ----------
        strange_check : bool
            Not used by this method; retained so that existing calls which
            pass this argument keep working.
        """
        if not self._add_cache2:
            return

        df = pd.DataFrame(self._add_cache2)
        field_order = self._get_field_order()
        assert len(field_order) == len(df.columns), "Length mismatch while committing table {}.\n{}\n\n{}".format(
            self.name,
            "".join(sorted(["{:20}".format(x) for x in field_order])),
            "".join(sorted(["{:20}".format(x.name) for x in self.columns])))
        df.columns = field_order

        # make sure that all strings are unicode, even under
        # Python 2.7:
        if sys.version_info < (3, 0):
            for column in df.columns[df.dtypes == object]:
                try:
                    df[column] = df[column].apply(utf8)
                except TypeError:
                    pass

        # apply unicode normalization; columns containing values that
        # cannot be normalized (TypeError) are left unchanged:
        for column in df.columns[df.dtypes == object]:
            try:
                df[column] = df[column].apply(lambda x: unicodedata.normalize("NFKC", x))
            except TypeError:
                pass

        df.to_sql(self.name, self._DB.engine, if_exists="append", index=False)
        self._add_cache2 = list()

    def add(self, values):
        """
        Store the 'values' dictionary in the add cache of the table. If
        necessary, a valid primary key is added to the values.

        Parameters
        ----------
        values : dict
            A dictionary with column names as keys, and the entry content
            as values.

        Returns
        -------
        id : int
            The id of the new row. If the primary key is not part of the
            row data, a running number is used; otherwise, the id is taken
            from 'values'.
        """
        row = [values[x] for x in self._row_order]

        if self.primary.name not in self._row_order:
            self._current_id += 1
            self._add_cache2.append(tuple([self._current_id] + row))
        else:
            self._current_id = values[self.primary.name]
            self._add_cache2.append(tuple(row))

        self._add_lookup[tuple(row)] = self._current_id

        # write the cache to the database if a cache limit has been set
        # and is exceeded:
        if self._max_cache and len(self._add_cache2) > self._max_cache:
            self.commit()

        return self._current_id

    def get_or_insert(self, values, case=False):
        """
        Returns the id of the first entry matching the values from the table.

        If there is no entry matching the values in the table, a new entry is
        added to the table based on the values.

        Parameters
        ----------
        values : dict
            A dictionary with column names as keys, and the entry content
            as values.
        case : bool
            Not used by this method.

        Returns
        -------
        id : int
            The id of the entry, as it is stored in the SQL table.
        """
        key = tuple([values[x] for x in self._row_order])
        # the membership test does not create a default entry in the
        # defaultdict
        if key in self._add_lookup:
            return self._add_lookup[key]
        return self.add(values)

    def _get_field_order(self):
        """
        Return the column names in the order in which the values of a row
        are stored in the add cache.
        """
        if self.primary.name not in self._row_order:
            return [self.primary.name] + self._row_order
        return self._row_order

    def find(self, values):
        """
        Return the first row that matches the values, or None
        otherwise.
        """
        x = self._DB.find(self.name, values, [self.primary.name])
        if x:
            return x[0]
        return None

    def add_column(self, column):
        """
        Add a column to the table definition.

        Parameters
        ----------
        column : object
            The column description to be added

        Raises
        ------
        ValueError
            If a column of the same name is already part of the row data,
            unless the attribute ``key`` of the new column is True.
        """
        self.columns.append(column)
        if column.name in self._row_order:
            if column.key:
                return
            raise ValueError("Duplicate column: {}, {}".format(self._row_order, column.name))
        if column.is_identifier:
            self.primary = column
            # the values of unique identifiers are not part of the row data,
            # they are generated by add()
            if not column.unique:
                self._row_order.append(column.name)
        else:
            self._row_order.append(column.name)

    def get_column(self, name):
        """
        Return the specified column.

        Parameters
        ----------
        name : string
            The name of the column

        Returns
        -------
        col : object or NoneType
            The Column object matching the name, or None.
        """
        for x in self.columns:
            if x.name == name:
                return x
        return None

    def get_create_string(self, db_type):
        """
        Generates the SQL command required to create the table.

        Parameters
        ----------
        db_type : str
            A string representing the SQL engine, either "mysql" or "sqlite"

        Returns
        -------
        S : str
            A string that can be sent to the SQL engine in order to create
            the table according to the specifications. For any other
            'db_type', the string is empty.
        """
        str_list = []
        columns_added = set()
        for column in self.columns:
            if column.name in columns_added:
                continue

            if db_type == SQL_MYSQL:
                if not column.is_identifier:
                    str_list.append("{} {}".format(
                        column.name, column.data_type))
                elif column.unique:
                    # no AUTO_INCREMENT is added here: the ids of unique
                    # identifiers are provided by add()
                    str_list.append("{} {}".format(
                        column.name, column.data_type))
                    str_list.append("PRIMARY KEY ({})".format(column.name))
                else:
                    # add surrogate key
                    str_list.insert(0, "{}_primary INT AUTO_INCREMENT".format(column.name))
                    str_list.insert(1, "{} {}".format(column.name, column.data_type))
                    str_list.append("PRIMARY KEY ({}_primary)".format(column.name))
                columns_added.add(column.name)

            elif db_type == SQL_SQLITE:
                # replace ENUM by a VARCHAR that is long enough for the
                # longest ENUM value:
                match = re.match(r"^\s*enum\((.+)\)(.*)$", column.data_type, re.IGNORECASE)
                if match:
                    max_len = max(len(x.strip(" '\""))
                                  for x in match.group(1).split(","))
                    data_type = "VARCHAR({max_len}) {spec}".format(
                        max_len=max_len, spec=match.group(2))
                else:
                    data_type = column.data_type

                if not column.is_identifier:
                    str_list.append("{} {}".format(column.name, data_type))
                elif column.unique:
                    str_list.append("{} {} PRIMARY KEY".format(
                        column.name, data_type))
                else:
                    # add surrogate key
                    str_list.insert(0, "{}_primary INT PRIMARY KEY".format(column.name))
                    str_list.insert(1, "{} {}".format(column.name, data_type))
                columns_added.add(column.name)

        if db_type == SQL_SQLITE:
            # make SQLite columns case-insensitive by default
            for i, x in enumerate(list(str_list)):
                field_type = x.split()[1].upper()
                if "VARCHAR" in field_type or "TEXT" in field_type:
                    str_list[i] = "{} COLLATE NOCASE".format(x)

        table_str = ", ".join(str_list)
        if db_type == SQL_SQLITE:
            # the UNSIGNED attribute is removed for SQLite
            return re.sub(r"\s*UNSIGNED", "", table_str)
        return table_str
    
class BaseCorpusBuilder(corpus.BaseResource):
    """ 
    This class is the base class used to build and install a corpus for 
    Coquery. For corpora currently not supported by Coquery, new builders 
    can be developed by subclassing this class.
    """
    # logger object, created by setup_logger()
    logger = None
    # template of the corpus module; __init__() assigns the module-level
    # string 'module_code' to it
    module_code = None
    # name of the corpus; check_arguments() takes it from the command line
    name = None
    # set to an empty dictionary by __init__()
    table_description = None
    
    # parsed command line arguments, see check_arguments()
    arguments = None
    # NOTE(review): 'name' is assigned a second time here; the assignment
    # is redundant.
    name = None
    # placeholder; the name is bound again by the method
    # additional_arguments() that is defined further down in this class
    additional_arguments = None
    # argparse.ArgumentParser instance, created by __init__()
    parser = None
    # database connection; used by build_create_tables() and commit_data()
    DB = None
    # NOTE(review): not used in the reviewed part of this class. This list
    # is a class attribute, and is therefore shared by all builders unless a
    # subclass or an instance rebinds it.
    additional_stages = []
    # NOTE(review): not used in the reviewed part of this class.
    start_time = None
    # file name pattern (fnmatch syntax) that is used by validate_path()
    file_filter = None
    # default value of the --encoding command line argument
    encoding = "utf-8"
    # names of the files that have to be present in the corpus directory,
    # see get_file_list() and validate_files()
    expected_files = []
    # special files are expected files that will not be stored in the file 
    # table. For example, a corpus may include a file with speaker 
    # information which needs to be evaluated during installation, and which
    # therefore has to be in the expected_files list, but which does not 
    # contain token information, and therefore should not be stored as a 
    # source file.
    special_files = []
    __version__ = "1.0"
    
    # NOTE(review): presumably the progress message that is shown while the
    # source files are read -- the code using it is not visible here.
    _read_file_formatter = "Reading {file} (%v of %m)..."
    
    def __init__(self, gui=False):
        """
        Initialize the internal state of the builder and set up the parser
        for the command line arguments.

        After the default arguments have been added to the parser,
        :func:`additional_arguments` is called so that corpus installers
        can add arguments of their own.

        Parameters
        ----------
        gui : object
            Stored in the attribute ``_widget``. If it evaluates to False,
            the builder reads its settings from the command line (see
            :func:`check_arguments`); otherwise, the object is used to
            report the progress of the installation.
        """
        # table descriptions and bookkeeping:
        self.module_code = module_code
        self.table_description = {}
        self._new_tables = {}
        self._primary_keys = {}
        self._id_count = {}
        self._time_features = []
        self._file_list = []
        self._source_count = collections.Counter()

        # state of the installation process:
        self._interrupted = False
        self._corpus_buffer = None
        self._corpus_id = 0
        self._widget = gui

        # set up argument parser:
        parser = argparse.ArgumentParser()
        self.parser = parser
        parser.add_argument(
            "name", type=str,
            help="name of the corpus")
        parser.add_argument(
            "path", type=str,
            help="location of the text files")
        parser.add_argument(
            "--db_user", type=str, default="coquery", dest="db_user",
            help="name of the MySQL user (default: coquery)")
        parser.add_argument(
            "--db_password", type=str, default="coquery", dest="db_password",
            help="password of the MySQL user (default: coquery)")
        parser.add_argument(
            "--db_host", type=str, default="127.0.0.1", dest="db_host",
            help="name of the MySQL server (default: localhost)")
        parser.add_argument(
            "--db_port", type=int, default=3306, dest="db_port",
            help="port of the MySQL server (default: 3306)")
        parser.add_argument(
            "--db_name", type=str,
            help="name of the MySQL database to be used (default: same as 'name')")
        parser.add_argument(
            "-o", action="store_true",
            help="optimize field structure (can be slow)")
        parser.add_argument(
            "-v", action="store_true", dest="verbose",
            help="produce verbose output")
        parser.add_argument(
            "-i", action="store_true",
            help="create indices (can be slow)")
        if options._use_nltk:
            parser.add_argument(
                "--no-nltk", action="store_false", dest="use_nltk",
                help="Do not use NLTK library for automatic part-of-speech tagging")
        parser.add_argument(
            "-l", action="store_true",
            help="load source files")
        parser.add_argument(
            "-c", action="store_true",
            help="create database tables")
        parser.add_argument(
            "-w", action="store_true",
            help="write corpus module")
        parser.add_argument(
            "--lookup_ngram", action="store_true",
            help="create an ngram lookup table (can be very big)")
        parser.add_argument(
            "--encoding", type=str, default=self.encoding,
            help="select a character encoding for the input files (e.g. latin1, default: {})".format(self.encoding))
        self.additional_arguments()
        
    def add_tag_table(self, features_only=False):
        """
        Define the resource features of the tag table, and create its table
        description.

        Corpora should usually have a tag table that is used to store
        text information. This method is called by
        :func:`build_create_tables`.

        Parameters
        ----------
        features_only : bool
            If True, only the resource features (table name and column
            names) are set, but no table description is created.
        """
        self.tag_table = "tags"
        self.tag_id = "TagId"
        self.tag_label = "Tag"
        self.tag_type = "Type"
        self.tag_corpus_id = self.corpus_id
        self.tag_attribute = "Attribute"

        if features_only:
            return

        tag_columns = [
            Identifier(self.tag_id, "MEDIUMINT(6) UNSIGNED NOT NULL"),
            Column(self.tag_type, "ENUM('open', 'close', 'empty')"),
            Column(self.tag_label, "TINYTEXT NOT NULL"),
            Link(self.tag_corpus_id, self.corpus_table),
            Column(self.tag_attribute, "TINYTEXT NOT NULL"),
        ]
        self.create_table_description(self.tag_table, tag_columns)

    def interrupt(self):
        """
        Request an interruption of the builder.

        This method only sets the internal interruption flag. The flag can
        be queried by the property ``interrupted``; methods such as
        :func:`commit_data` and :func:`build_create_tables` check it, and
        return early if it is set.

        In particular, this method is called in the GUI if the Cancel button
        is pressed.
        """
        self._interrupted = True
        
    @property
    def interrupted(self):
        """ True if an interruption has been requested by interrupt()."""
        return self._interrupted

    def check_arguments(self):
        """
        Evaluate the command line arguments, and add defaults if necessary.

        Nothing is done if the builder was created with a GUI widget.
        Otherwise, the parsed arguments are stored in the attribute
        ``arguments``, the database name defaults to the corpus name, and
        the attribute ``name`` is set to the corpus name. Command line
        arguments that are unknown to the parser are ignored.
        """
        if self._widget:
            return

        parsed, _ = self.parser.parse_known_args()
        self.arguments = parsed
        if not options._use_nltk:
            parsed.use_nltk = False
        if not parsed.db_name:
            parsed.db_name = parsed.name
        self.name = parsed.name
            
    def additional_arguments(self):
        """
        Hook for corpus installers that require additional command line
        arguments.

        The method is called at the end of :func:`__init__`. The default
        implementation does nothing.
        """
        return None
    
    def commit_data(self):
        """
        Write all corpus data that is only stored in the internal tables
        so far to the database.

        Every table description is committed first. After that, the
        content of the corpus buffer, if there is any, is appended to the
        corpus table, and the buffer is emptied. Nothing is written if the
        builder has been interrupted.

        :func:`commit_data` is usually called for each file after the content
        has been processed.
        """
        if self.interrupted:
            return

        for pending_table in self._new_tables.values():
            pending_table.commit(strange_check=True)

        if not self._corpus_buffer:
            return

        frame = pd.DataFrame(self._corpus_buffer)
        frame.to_sql(self.corpus_table, self.DB.engine,
                     if_exists="append", index=False)
        self._corpus_buffer = []

    def create_table_description(self, table_name, column_list):
        """
        Create the description of a MySQL table, and register it under the
        given name. The MySQL table described in this way will be created
        during :func:`build` by calling :func:`build_create_tables`.

        Link columns receive the data type of the primary key of the table
        that they link to. That table therefore has to be described before
        it can be linked.

        Parameters
        ----------
        table_name : string
            The name of the MySQL table
        column_list : list
            A list of :class:`Column` instances

        Raises
        ------
        KeyError
            If a link refers to a table that has not been described yet
        ValueError
            If a column cannot be added to the table. The table name and
            the column are printed before the exception is raised again.
        """
        description = Table(table_name)
        for column in column_list:
            if isinstance(column, Link):
                try:
                    column.data_type = self._new_tables[column._link].primary.data_type
                except KeyError:
                    raise KeyError("Table description for '{}' contains a link to unknown table '{}'".format(table_name, column._link))
            try:
                description.add_column(column)
            except ValueError as e:
                print(table_name, column)
                raise e
        self._new_tables[table_name] = description

    def table(self, table_name):
        """
        Look up the table description that was registered under the
        specified name.

        Parameters
        ----------
        table_name : string
            The name of the table

        Returns
        -------
        table : object
            A Table object, or None if there is no table of the given name
        """
        return self._new_tables.get(table_name)

    def setup_logger(self):
        """
        Create the logger of the builder.

        The logger is named after the corpus. Messages of level INFO and
        above are written to the file '<name>.log'. Messages of level
        WARNING and above are additionally sent to a stream handler, where
        they are wrapped to a width of 79 characters.
        """
        class TextwrapFormatter(logging.Formatter):
            """ A formatter that wraps the formatted log entries."""
            def __init__(self, fmt):
                super(TextwrapFormatter, self).__init__(fmt=fmt)
                wrapper = textwrap.TextWrapper(
                    width=79, subsequent_indent="        ")
                self.wrap = wrapper.fill

            def format(self, entry):
                formatted = super(TextwrapFormatter, self).format(entry)
                return "\n%s\n" % self.wrap(formatted)

        self.logger = logging.getLogger(self.name)
        self.logger.setLevel(logging.INFO)

        # the log file receives every message handled by the logger:
        to_file = logging.FileHandler("%s.log" % self.name)
        to_file.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)-8s %(message)s"))
        self.logger.addHandler(to_file)

        # the stream handler is restricted to warnings and errors:
        to_stream = logging.StreamHandler()
        to_stream.setFormatter(
            TextwrapFormatter("%(levelname)s %(message)s"))
        to_stream.setLevel(logging.WARNING)
        self.logger.addHandler(to_stream)

    def build_create_tables(self):
        """
        Create the database tables used by the corpus, based on the column
        information given in the table descriptions (see
        :func:`create_table_description`).

        The description of the tag table is added first. If a GUI widget is
        available, the progress is reported to it after each table. The
        method returns early if the builder has been interrupted.
        """
        self.add_tag_table()

        # initialize progress bars:
        if self._widget:
            self._widget.progressSet.emit(
                len(self._new_tables), "Creating tables... (%v of %m)")
            self._widget.progressUpdate.emit(0)

        for count, (table_name, description) in enumerate(
                self._new_tables.items(), 1):
            description.setDB(self.DB)
            create_string = description.get_create_string(
                self.arguments.db_type)
            self.DB.create_table(table_name, create_string)
            if self._widget:
                self._widget.progressUpdate.emit(count)
            if self.interrupted:
                return

    @classmethod
    def get_file_list(cls, path, file_filter, sort=True):
        """
        Collect the names of the corpus files found below the given path.

        This method recursively searches in the directory ``path`` and its
        subdirectories. If the class attribute ``expected_files`` is not
        empty, only files with an expected name are collected (each entry
        of ``expected_files`` is matched once), and ``file_filter`` is
        ignored. Otherwise, all files matching ``file_filter`` are
        collected.

        Parameters
        ----------
        path : string
            The path in which to look for files
        file_filter : string
            A filter that is applied to all file names in the path. If the
            filter is empty or None, every file is accepted.
        sort : bool
            True if the file list should be returned in alphabetical order,
            or False otherwise.

        Returns
        -------
        l : list
            A list of strings, each representing a file name including its
            path
        """
        collected = []
        still_expected = list(cls.expected_files)
        for directory, _, file_names in os.walk(path):
            for file_name in file_names:
                if cls.expected_files:
                    if file_name not in still_expected:
                        continue
                    still_expected.remove(file_name)
                elif file_filter and not fnmatch.fnmatch(file_name, file_filter):
                    continue
                collected.append(os.path.join(directory, file_name))
        return sorted(collected) if sort else collected
    
    def validate_path(self, path):
        """
        Check whether the directory ``path`` may contain corpus data files.

        The directory is accepted as soon as it, or one of its
        subdirectories, contains an existing file, or an entry that matches
        the class attribute ``file_filter``.

        NOTE(review): as every existing file is accepted, the file filter
        has hardly any effect on the result -- confirm that this is
        intended.

        Parameters
        ----------
        path : string
            The path to be validated

        Returns
        -------
        valid : bool
            True if ``path`` is a directory that contains at least one
            acceptable entry, or False otherwise.
        """
        # check if path exists:
        if not os.path.isdir(path):
            return False

        # check if path contains any acceptable entry:
        return any(
            os.path.isfile(os.path.join(directory, file_name))
            or not self.file_filter
            or fnmatch.fnmatch(file_name, self.file_filter)
            for directory, _, file_names in os.walk(path)
            for file_name in file_names)

    @classmethod
    def validate_files(cls, l):
        """
        Validates the file list.

        The default implementation will compare the content of the argument
        to the class attribute expected_files. If there is an entry in
        expected_files that is not also in the argument list, the file list
        is considered to be invalid. Only the base names of the files are
        compared.

        Parameters
        ----------
        l : list
            A list of file names as created by get_file_list()

        Raises
        ------
        RuntimeError
            If at least one expected file is missing. The message is
            HTML-formatted, and lists up to six missing files. If more than
            six files are missing, the first five files are listed,
            followed by the number of the remaining files.
        """
        found = set(os.path.basename(x) for x in l).intersection(cls.expected_files)
        if len(found) >= len(set(cls.expected_files)):
            return

        missing_list = [x for x in cls.expected_files if x not in found]
        if len(missing_list) > 6:
            shown = missing_list[:5]
            # the count refers to the files that are not listed by name;
            # it is placed after the closing </code> tag so that the HTML
            # markup stays balanced
            remainder = ", and {} other files".format(
                len(missing_list) - len(shown))
        else:
            shown = missing_list
            remainder = ""
        raise RuntimeError("<p>Not all expected corpora files were found in the specified corpus data directory. Missing files are:</p><p><code>{}</code>{}</p>".format("<br/>".join(shown), remainder))
        
    def get_corpus_code(self):
        """
        Return a text string containing the Python source code for the
        Corpus class of this module.

        The code is obtained from the attribute ``_corpus_code``. All
        source lines that start with the word 'class' are left out. An
        empty string is returned if the attribute is not available.
        """
        try:
            source_lines, _ = inspect.getsourcelines(self._corpus_code)
        except AttributeError:
            return ""
        return "".join(
            line for line in source_lines
            if not line.strip().startswith("class"))

    def map_query_item(self, item_type, rc_feature):
        """
        Maps a query item type to the given resource feature.

        The resource feature is stored in the attribute that belongs to the
        query item type, e.g. ``query_item_word`` for QUERY_ITEM_WORD.
        Unknown query item types are ignored.

        Parameters
        ----------
        item_type : str
            One of the string constants from defines.py:
            QUERY_ITEM_WORD, QUERY_ITEM_LEMMA, QUERY_ITEM_TRANSCRIPT,
            QUERY_ITEM_POS, QUERY_ITEM_GLOSS

        rc_feature : str
            The resource feature that will be used to access the information
            needed for the query item type specified by 'item_type'.
        """
        attribute_names = (
            (QUERY_ITEM_WORD, "query_item_word"),
            (QUERY_ITEM_LEMMA, "query_item_lemma"),
            (QUERY_ITEM_TRANSCRIPT, "query_item_transcript"),
            (QUERY_ITEM_POS, "query_item_pos"),
            (QUERY_ITEM_GLOSS, "query_item_gloss"),
        )
        # only the first matching entry is used
        for known_type, attribute in attribute_names:
            if item_type == known_type:
                setattr(self, attribute, rc_feature)
                break

    def add_time_feature(self, rc_feature):
        """
        Register a resource feature as a time feature.

        Time features are those features that can be visualized using a
        time series visualization. The feature is appended to the list
        self._time_features.

        Parameters
        ----------
        rc_feature : str
            The resource feature that is added to the list of time features
        """
        time_features = self._time_features
        time_features.append(rc_feature)
    
    def get_lexicon_code(self):
        """
        Return the Python source code for the lexicon of this module.

        The source code is read by inspecting the class attribute
        self._lexicon_code. Lines whose stripped content starts with 'class'
        are dropped, so that the class header is not part of the result.
        If the attribute self._lexicon_code does not exist, an empty string
        is returned.

        This method is needed to add lexicon-specific code to the Python
        corpus module.

        Returns
        -------
        code : str
            The retained source lines, joined into a single string
        """
        try:
            source_lines, _ = inspect.getsourcelines(self._lexicon_code)
        except AttributeError:
            return ""
        return "".join(
            line for line in source_lines
            if not line.strip().startswith("class"))
    
    def get_resource_code(self):
        """
        Return the Python source code for the resource of this module.

        The result starts with two generated assignments: 'number_of_tokens'
        (the current value of self._corpus_id) and 'time_features' (the
        content of self._time_features as a list of string literals). They
        are followed by the source code obtained by inspecting the class
        attribute self._resource_code, without any line whose stripped
        content starts with 'class'. If self._resource_code does not exist,
        only the two generated assignments are returned.

        This method is needed to add resource-specific code to the Python
        corpus module.

        Returns
        -------
        code : str
            The generated assignments and the retained source lines, joined
            into a single string. Every line ends with a line break.
        """
        try:
            source_lines = inspect.getsourcelines(self._resource_code)[0]
        except AttributeError:
            source_lines = []
        body = [x for x in source_lines
                if not x.strip().startswith("class")]

        time_features = ", ".join(
            ['"{}"'.format(x) for x in self._time_features])
        header = [
            "    number_of_tokens = {}\n".format(self._corpus_id),
            # The line break at the end is required: without it, the first
            # line of the resource code would be glued to this assignment,
            # which results in invalid Python code.
            "    time_features = [{}]\n".format(time_features)]
        return "".join(header + body)
    
    def get_method_code(self, method):
        """
        Placeholder without any effect.

        The argument 'method' is ignored, and None is returned.
        """
        return None

    def store_filename(self, file_name):
        """
        Store the file in the file table, but not if it is a special file 
        listed in self.special_files.

        The file name is split into the base name and the directory part.
        Both parts are stored in the file table, unless an entry with these
        values already exists. The method also sets the attributes
        self._file_name, self._value_file_name, self._value_file_path, and
        self._file_id. For special files, none of these attributes is 
        changed.

        Parameters
        ----------
        file_name : str 
            The path to the file

        Returns
        -------
        file_id : int or None
            The id of the file in the table, or None if the file is a special
            file
        """
        if file_name in self.special_files:
            return None
        self._file_name = file_name
        self._value_file_path, self._value_file_name = os.path.split(file_name)

        self._file_id = self.table(self.file_table).get_or_insert(
            {self.file_name: self._value_file_name,
             self.file_path: self._value_file_path})
        # return the id, as promised by the documentation of this method:
        return self._file_id

    #def get_lemma(self, word):
        #""" Return a lemma for the word. By default, this is simply the
        #word in lower case, but this method can be overloaded with methods
        #that use e.g. lemma dictionaries. 
        #The method is used by the default file processing methods. If your
        #corpus implements a specific file processing method, get_lemma() may
        #be obsolete. """
        #return word.lower()
    
    #def get_lemma_id(self, word):
        #""" Return a lemma identifier for the word. If there is a separate 
        #lemma table, the identifier is an index to that table. Otherwise, 
        #the identifier is the lemma label."""
        #try:
            #return self.table_get(self.lemma_table, 
                #{self.lemma_label: self.get_lemma(word)})
        #else:
            #return self.get_lemma(word)
    
    #def get_pos(self, word):
        #""" Return the part-of-speech for the word. By default, an empty
        #string is returned, but this method may be overloaded with methods
        #that use for example a pos-tagged dictionary.
        #The method is used by the default file processing methods. If your
        #corpus implements a specific file processing method, get_lemma() may
        #be obsolete. """
        #return ""
    
    #def get_pos_id(self, word):
        #""" Return a part-of-speech identifier for the word. If there is a 
        #separate part-of-speech table, the identifier is an index to that 
        #table. Otherwise, the identifier is the part-of-speech label."""
        
        #if "pos_table" in self.table_description:
            #return self.table_get(self.pos_table, 
                #{self.pos_label: self.get_pos(word)})
        #else:
            #return self.get_pos(word)        

    #def get_transcript(self, word):
        #""" Return the phonemic transcript for the word. By default, an 
        #empty string is returned, but this method may be overloaded with 
        #methods that use for example a pronunciation dictionary.
        #The method is used by the default file processing methods. If your
        #corpus implements a specific file processing method, get_lemma() may
        #be obsolete. """
        #return ""
    
    #def get_transcript_id(self, word):
        #""" Return a transcription identifier for the word. If there is a 
        #separate transcription table, the identifier is an index to that 
        #table. Otherwise, the identifier is the transcript label."""
        
        #if "transcript_table" in self.table_description:
            #return self.table_get(self.transcript_table, 
                #{self.transcript_label: self.get_transcript(word)})
        #else:
            #return self.get_transcript(word)        

    def process_xlabel_file(self, file_name):
        """ 
        Process an xlabel file.

        This method reads the content of the file, and interprets it as an
        ESPS/waves+ xlabel file. Xlabel files are used in some spoken 
        corpora to represent phonetic annotations. A description of the 
        file format can be found here: 

        http://staffhome.ecm.uwa.edu.au/~00014742/research/speech/local/entropic/ESPSDoc/waves/manual/xlabel.pdf

        Basically, an xlabel file consists of a header and a file body, 
        separated by a row containing only the hash mark '#'. This method 
        ignores the data from the header. Rows in the file body consist of
        three columns ``time color word``, separated by whitespace. Rows 
        that do not consist of exactly three columns are ignored.

        For each valid row, the word is looked up in the word table (and 
        inserted if it is not there yet), and a token with the word id, the
        id of the current file, and the time is added to the corpus.

        The file is read using the encoding given in 
        self.arguments.encoding. If the content cannot be decoded in that 
        encoding, the file is read again as 'ISO-8859-1'.

        Parameters
        ----------
        file_name : string
            The path name of the file that is to be processed
        """
        def read_content(encoding):
            with codecs.open(file_name, "r", encoding=encoding) as input_file:
                return input_file.read()

        try:
            content = read_content(self.arguments.encoding)
        except UnicodeDecodeError:
            content = read_content("ISO-8859-1")

        in_body = False
        for row in content.splitlines():
            # the hash mark separates the header from the file body:
            if row.strip() == "#":
                in_body = True
                continue
            if not in_body:
                continue

            # rows can sometimes contain only the time tag, but no other 
            # labels. Such rows are skipped:
            fields = row.split()
            if len(fields) != 3:
                continue
            time_stamp, _, word = fields

            # get a word id for the current word:
            word_id = self.table(self.word_table).get_or_insert(
                {self.word_label: word})

            # add the word as a new token to the corpus:
            self.add_token_to_corpus(
                {self.corpus_word_id: word_id, 
                 self.corpus_file_id: self._file_id,
                 self.corpus_time: time_stamp})

    def process_text_file(self, file_name):
        """
        Process a plain text file.

        This implementation does not process anything, but always raises an
        exception. It is called by the default implementation of 
        :func:`process_file`.

        Parameters
        ----------
        file_name : string
            The path name of the file that is to be processed

        Raises
        ------
        NotImplementedError
            Always. NotImplementedError is a subclass of RuntimeError, so 
            code that expects a RuntimeError keeps working.
        """
        raise NotImplementedError(
            "process_text_file() is not implemented by this corpus builder")
    
    def _add_next_token_to_corpus(self, values):
        self._corpus_id += 1
        values[self.corpus_id] = self._corpus_id
        self._corpus_buffer.append(values)
        
    def add_token_to_corpus(self, values):
        """
        Add the first token to the corpus, and set up the token buffer.

        The token counter self._corpus_id is increased by one, and the new 
        value is stored in 'values' as the corpus id of the token. Note that
        'values' is modified in place. The keys of 'values' are stored in
        self._corpus_keys, and a new buffer self._corpus_buffer is created
        that contains only this token.

        After the first call, this method replaces itself on the instance by
        :func:`_add_next_token_to_corpus`, which skips the validation and 
        the buffer setup.

        Parameters
        ----------
        values : dict
            A dictionary with corpus table columns as keys and the cell
            contents of the new token as values

        Raises
        ------
        IndexError
            If 'values' contains fewer entries than the number of columns of
            the corpus table minus two
        """
        corpus_columns = self._new_tables[self.corpus_table].columns
        if len(values) < len(corpus_columns) - 2:
            raise IndexError

        self._corpus_id += 1
        values[self.corpus_id] = self._corpus_id
        self._corpus_keys = values.keys()
        self._corpus_buffer = [values]

        # all following tokens are handled by the simpler method:
        self.add_token_to_corpus = self._add_next_token_to_corpus
    
    ### METHODS FOR XML FILES

    def xml_parse_file(self, file_object):
        """
        Return the root of the XML parsed tree from the file object. 

        If there is a parsing error, the error is logged using self.logger,
        and the exception is raised again. The message of the parsing error
        is the only diagnostic that is provided; the content of the file
        around the error position is not printed.

        Parameters
        ----------
        file_object : file object or str
            The source that is passed to ET.parse()

        Returns
        -------
        root : element
            The root element of the parsed XML tree

        Raises
        ------
        ET.ParseError
            If the content cannot be parsed
        """
        try:
            root = ET.parse(file_object).getroot()
        except ET.ParseError as parse_error:
            self.logger.error(parse_error)
            # re-raise the active exception so that the traceback is kept:
            raise
        return root

    def xml_get_body(self, root):
        """
        Return the XML body from the root.

        This implementation always raises MethodNotImplementedError.

        Parameters
        ----------
        root : element
            The root of the parsed XML tree
        """
        raise MethodNotImplementedError()

    def xml_get_meta_information(self, root):
        """
        Retrieve and store all meta information from the root.

        This implementation always raises MethodNotImplementedError.

        Parameters
        ----------
        root : element
            The root of the parsed XML tree
        """
        raise MethodNotImplementedError()
        
    def xml_process_element(self, element):
        """
        Process the XML element, and recursively all nested elements.

        Processing involves several stages:

        1. Call xml_preprocess_tag(element) for tag actions when entering 
        2. Call xml_process_content(element.text) to process the content,
           but only if the element has any text
        3. Call xml_process_element() for every nested element
        4. Call xml_process_tail() with the tail of the element after 
           surrounding whitespace has been removed, but only if the stripped
           tail is not empty
        5. Call xml_postprocess_tag(element) for tag actions when leaving

        Parameters
        ----------
        element : element
            The XML element that is to be processed
        """
        self.xml_preprocess_tag(element)

        content = element.text
        if content:
            self.xml_process_content(content)

        for nested in list(element) and element:
            self.xml_process_element(nested)

        tail = element.tail
        if tail is not None:
            tail = tail.strip()
            if tail:
                self.xml_process_tail(tail)

        self.xml_postprocess_tag(element)
    
    def xml_preprocess_tag(self, element):
        """
        Take any action that is triggered by the tag when entering the 
        element.

        This implementation does nothing. It is called by 
        :func:`xml_process_element` before the content of the element is
        processed.

        Parameters
        ----------
        element : element
            The XML element that is entered
        """
        return None
    
    def xml_process_content(self, element):
        """
        Process the text content of an XML element.

        This implementation does nothing. Note that 
        :func:`xml_process_element` calls this method with the text of the
        element (``element.text``), not with the element itself.

        Parameters
        ----------
        element : str
            The text content of the XML element
        """
        return None
    
    def xml_process_tail(self, element):
        """
        Process the tail of an XML element.

        This implementation does nothing. Note that 
        :func:`xml_process_element` calls this method with the tail text of
        the element after surrounding whitespace has been removed, not with
        the element itself.

        Parameters
        ----------
        element : str
            The stripped tail text of the XML element
        """
        return None
    
    def xml_postprocess_tag(self, element):
        """
        Take any action that is triggered by the tag when leaving the 
        element.

        This implementation does nothing. It is called by 
        :func:`xml_process_element` as the last step of processing an 
        element.

        Parameters
        ----------
        element : element
            The XML element that is left
        """
        return None

    def tag_token(self, token_id, tag, attributes, op=False, cl=False):
        """
        Add a tag to the specified token.

        An entry is added to the tag table that contains the name of the 
        tag, the token id, the tag type ('open' or 'close'), and the 
        attributes as a string of the form ``name=value, name=value``.

        Parameters
        ----------
        token_id : int 
            The ID of the token to be tagged 
        tag : str 
            The tag type
        attributes : dict
            A dictionary containing the attributes for the tag 
        op, cl: bool 
            Set 'op' to True if the tag is an opening tag. Set 'cl' to 
            True if the tag is a closing tag. 

        Raises
        ------
        ValueError
            If neither or both of 'op' and 'cl' are True
        """
        # exactly one of the two flags has to be set:
        if bool(op) == bool(cl):
            raise ValueError
        tag_type = "open" if op else "close"

        tag_table = self.table(self.tag_table)
        attribute_string = ", ".join(
            ["{}={}".format(name, attributes[name]) for name in attributes])
        tag_table.add(
            {self.tag_label: "{}".format(tag),
             self.tag_corpus_id: token_id,
             self.tag_type: tag_type,
             self.tag_attribute: attribute_string})

    def tag_next_token(self, tag, attributes):
        """
        Add an entry to the tag table that marks the next corpus_id.

        The tag is marked as an opening tag and contains the name ``tag`` 
        and a string representation of the dictionary ``attributes``. The
        tagged corpus id is the current value of self._corpus_id plus one.

        The closing counterpart can be added by calling 
        :func:`tag_last_token`.

        Parameters
        ----------
        tag : string
            The name of the tag
        attributes : dict
            A dictionary containing the attributes of the opening tag.
        """
        tag_table = self.table(self.tag_table)
        attribute_string = ", ".join(
            ["{}={}".format(name, attributes[name]) for name in attributes])
        next_id = self._corpus_id + 1
        tag_table.add(
            {self.tag_label: "{}".format(tag),
             self.tag_corpus_id: next_id,
             self.tag_type: "open",
             self.tag_attribute: attribute_string})

    def tag_last_token(self, tag, attributes):
        """ 
        Add an entry to the tag table that marks the last corpus_id.

        The tag is marked as a closing tag and contains the name `tag` and a 
        string representation of the dictionary `attributes`. The tagged 
        corpus id is the current value of self._corpus_id, i.e. the id that
        was assigned to the token that was added last.

        The opening counterpart can be added by calling
        :func:`tag_next_token`.

        Parameters
        ----------
        tag : string
            The name of the tag
        attributes : dict
            A dictionary containing the attributes of the closing tag.
        """
        # self._corpus_id is increased before it is assigned to a new token,
        # so it always holds the id of the last token. Using the preceding
        # id would close the tag one token too early.
        self.table(self.tag_table).add(
            {self.tag_label: "{}".format(tag),
                self.tag_corpus_id: self._corpus_id,
                self.tag_type: "close",
                self.tag_attribute: ", ".join(
                    ["{}={}".format(x, attributes[x]) for x in attributes])})

    def add_empty_tag(self, tag, attributes):
        """ 
        Add an empty tag after the current corpus element.

        This method is usually called from within :func:`process_file` or a
        related method. It will add an entry to the tag table so that 
        an empty tag is inserted into the corpus after the current corpus
        element. This empty tag has the name ``tag`` and the attributes 
        given in ``attributes``. The tagged corpus id is the current value 
        of self._corpus_id plus one.

        Parameters
        ----------
        tag : string
            The name of the tag
        attributes : dict
            A dictionary containing the attributes of the empty tag.

        Examples
        --------
        Let's assume that the corpus file contains an empty XML tag that 
        serves as a placeholder for graphics that are contained in the 
        original texts. In the ICE-NG files, such a placeholder is indicated
        by ``<object type="graphic">``. In order to store this information
        in the tag table, the corpus installer may have the line 
        ``self.add_empty_tag("object", {"type": "graphic"})`` in the 
        reimplementation of :func:`process_file` so that the method is 
        called when the placeholder tag is encountered in the source files.
        """
        tag_table = self.table(self.tag_table)
        attribute_string = ", ".join(
            ["{}={}".format(name, attributes[name]) for name in attributes])
        next_id = self._corpus_id + 1
        tag_table.add(
            {self.tag_label: "{}".format(tag),
             self.tag_corpus_id: next_id,
             self.tag_type: "empty",
             self.tag_attribute: attribute_string})

    ### END XML

    def process_file(self, file_name):
        """
        Pass the file name to a processing method.

        The processing method is expected to read the content from the 
        file, to parse the information relevant for the corpus, and to store
        the information to the database. The default implementation treats 
        every file as a plain text file, and always hands the file name over
        to :func:``process_text_file``. The return value of that method is
        discarded.

        Subclasses of BaseCorpusBuilder should override this method so that
        the appropriate method is called for the file. In this way, it is
        possible for example to treat some files as plain text files by
        calling :func:``process_text_file`` on them, and other files as 
        XML files by calling :func:``process_xml_file``.

        Parameters
        ----------
        file_name : string
            The path name of the file that is to be processed
        """
        handler = self.process_text_file
        handler(file_name)

    def build_load_files(self):
        """
        Go through the list of suitable files, and call process_file() on
        each file name. 

        The file list is obtained from get_file_list(), using the path from
        self.arguments.path and the filter self.file_filter. If the list is
        empty, a warning is logged, and nothing else happens.

        Each file for which db_has() does not report an entry in the file 
        table is added to the file table by store_filename(), and is 
        processed by process_file(). commit_data() is called after each 
        file. If a widget is available, its label and its progress are 
        updated for each file. The method returns early if self.interrupted
        is set.
        """
        self._file_list = self.get_file_list(
            self.arguments.path, self.file_filter)
        if not self._file_list:
            self.logger.warning("No files found at %s" % self.arguments.path)
            return

        if self._widget:
            self._widget.progressSet.emit(len(self._file_list), "")
            self._widget.progressUpdate.emit(0)

        for file_number, file_name in enumerate(self._file_list, 1):
            if self._widget:
                label = self._read_file_formatter.format(file=file_name)
                self._widget.labelSet.emit(label)

            if self.interrupted:
                return

            # NOTE(review): the complete file name is compared to the path 
            # column here, whereas store_filename() stores only the 
            # directory part in that column -- confirm that this is intended
            already_stored = self.db_has(
                self.file_table, {self.file_path: file_name})
            if not already_stored:
                self.logger.info("Loading file %s" % (file_name))
                self.store_filename(file_name)
                self.process_file(file_name)

            if self._widget:
                self._widget.progressUpdate.emit(file_number)
            self.commit_data()

    def db_has(self, table, values, case=False):
        """
        Test whether the table contains a record matching the values.

        Parameters
        ----------
        table : str 
            The name of the table 
        values : dict 
            A dictionary with column names as keys and cell contents as values
        case : bool
            Set to True if the test should be case-sensitive, or False 
            otherwise.

        Returns
        -------
        b : bool
            True if db_find() returns at least one row for the arguments, 
            or False otherwise
        """
        matches = self.db_find(table, values, case)
        return len(matches.index) > 0

    def db_find(self, table, values, case=False):        
        """ 
        Obtain all records from the table that match the column-value
        pairs given in the dict values.

        The query selects only the columns that are used as keys in 
        'values'. Values are converted to strings; single quotes in the 
        values are doubled, and each backslash in the query string is 
        doubled, before the query is passed to pandas.

        Parameters
        ----------
        table : str 
            The name of the table 
        values : dict 
            A dictionary with column names as keys and cell contents as values
        case : bool
            Set to True if the find should be case-sensitive, or False 
            otherwise. A case-sensitive find uses the BINARY operator in 
            the conditions.

        Returns
        -------
        df : pandas data frame
            a data frame containing the matching entries
        """
        columns = list(values)
        conditions = [
            "{} = '{}'".format(column, str(value).replace("'", "''"))
            for column, value in values.items()]

        if case:
            template = "SELECT {} FROM {} WHERE BINARY {}"
            glue = " AND BINARY "
        else:
            template = "SELECT {} FROM {} WHERE {}"
            glue = " AND "

        query = template.format(
            ", ".join(columns), table, glue.join(conditions))
        query = query.replace("\\", "\\\\")
        return pd.read_sql(query, self.DB.engine)


    def build_create_frequency_table(self):
        """ 
        Create a frequency table for all combinations of corpus features.

        This method is intended to create a database table named 
        'coq_frequency_count' with all corpus features as columns, and a row
        for each combination of corpus features that occurs in the corpus. 
        The last column 'Count' gives the number of tokens in the corpus 
        that have this combination of features.

        The frequency table can be used to look up quickly the size of a 
        subcorpus as well as the overall corpus. This is important for 
        reporting frequency counts as per-million-word frequencies.

        NOTE: the table is not created yet. At the moment, this method is a
        placeholder without any effect.
        """
        return None
        
    def build_lookup_ngram(self):
        """
        Create a lookup table for multi-item query strings.

        The lookup table is named like the corpus table, followed by the 
        suffix 'Ngram'. It contains all columns of the corpus table except 
        for the word column. The word column is replaced by one word column
        for each position in the n-gram; these columns are named like the 
        word column, followed by the position (1, 2, ...). The number of 
        positions is taken from self.arguments.ngram_width.

        The table is filled in chunks of 50000 corpus ids. If a widget is 
        available, its progress is updated after each chunk. Filling stops
        if self.interrupted is set.

        Afterwards, rows are added for the last tokens of the corpus, i.e. 
        those tokens that are followed by less tokens than needed for a 
        complete n-gram. The positions without a token are filled with a 
        padding value so that the whole corpus is searchable.

        The attributes self.corpusngram_table and self.corpusngram_width are
        set by this method.
        """
        def get_sources(n):
            """ 
            Return the FROM clause, the list of word fields, and the join 
            conditions for n consecutive tokens. 
            """
            tables = ["{} AS coq_corpus_{}".format(self.corpus_table, i + 1) 
                      for i in range(n)]
            words = ["coq_corpus_{}.{}".format(i + 1, word_id) 
                     for i in range(n)]
            conditions = [
                "coq_corpus_1.{id} + {n} = coq_corpus_{alias}.{id}".format(
                    id=self.corpus_id, n=i, alias=i + 1) 
                for i in range(1, n)]
            if conditions:
                condition_str = " AND {}".format(" AND ".join(conditions))
            else:
                condition_str = ""
            return ", ".join(tables), words, condition_str

        S = "SELECT MAX({}) FROM {}".format(self.corpus_id, self.corpus_table)
        with self.DB.engine.connect() as connection:
            result = connection.execute(S).fetchone()
        # MAX() yields NULL if the corpus table is empty. Use 0 in that case
        # so that the calculations below do not fail:
        max_id = result[0] or 0
        self.logger.info("Creating lookup table, max_id is {}".format(max_id))

        self.corpusngram_table = "{}Ngram".format(self.corpus_table)
        self.corpusngram_width = int(self.arguments.ngram_width)
        # always use the converted value, as the argument may not be an 
        # integer:
        width = self.corpusngram_width

        if hasattr(self, "corpus_word_id"):
            word_id = self.corpus_word_id
            max_word = self._new_tables[self.word_table]._current_id + 1
        elif hasattr(self, "corpus_word"):
            word_id = self.corpus_word
            max_word = DEFAULT_MISSING_VALUE

        corpus_tab = self._new_tables[self.corpus_table]
        corpus_columns = [x.name for x in corpus_tab.columns 
                          if x.name != word_id]
        word_columns = ["{}{}".format(word_id, i + 1) for i in range(width)]
        column_str = ", ".join(corpus_columns + word_columns)

        # set up the description of the lookup table: all columns except 
        # for the word column are copied from the corpus table ...
        new_tab_desc = []
        for col in corpus_tab.columns:
            if col.name != word_id:
                if col.is_identifier:
                    new_col = Identifier(col.name, col.data_type, unique=col.unique)
                else:
                    new_col = Column(col.name, col.data_type)
                new_tab_desc.append(new_col)

        # ... and one word column is added for each n-gram position:
        word_col = dict((x.name, x) for x in corpus_tab.columns)[word_id]
        for x in word_columns:
            new_tab_desc.append(Column(x, word_col.data_type))

        self.create_table_description(self.corpusngram_table, new_tab_desc)
        create_str = self._new_tables[self.corpusngram_table].get_create_string(self.arguments.db_type)
        self.DB.create_table(self.corpusngram_table, create_str)

        # 'INSERT INTO' is understood by MySQL as well as by SQLite, whereas 
        # SQLite does not accept 'INSERT' without 'INTO':
        sql_template = (
            "INSERT INTO {corpus_ngram} ({columns}) "
            "SELECT {fields} "
            "FROM {join} "
            "WHERE {token_range} {join_str}")

        step = 50000
        current_id = 0

        if self._widget:
            self._widget.progressSet.emit(1 + ((max_id-1) // step), "Creating ngram lookup table... (chunk %v of %m)")
            self._widget.progressUpdate.emit(1)

        # these parts of the query are the same for every chunk:
        table_join, additional_words, join_str = get_sources(width)
        base_fields = ["coq_corpus_1.{}".format(x) for x in corpus_columns]
        field_str = ", ".join(base_fields + additional_words)

        _chunk = 1
        with self.DB.engine.connect() as connection:
            while current_id <= max_id and not self.interrupted:
                token_range = "coq_corpus_1.{token} >= {lower} AND coq_corpus_1.{token} < {upper}".format(
                    token=self.corpus_id,
                    lower=current_id, upper=current_id + step)

                S = sql_template.format(corpus_ngram=self.corpusngram_table,
                                        columns=column_str,
                                        fields=field_str,
                                        join=table_join,
                                        token_range=token_range,
                                        join_str=join_str)
                connection.execute(S)

                current_id = current_id + step
                _chunk += 1
                if self._widget:
                    self._widget.progressUpdate.emit(_chunk)

            # insert missing rows so that the whole corpus is searchable. 
            # In each pass, n is the number of padded positions:
            for n in range(1, width):
                token_id = max_id - width + n + 1
                token_range = "coq_corpus_1.{token} = {current}".format(
                    token=self.corpus_id, current=token_id)
                padded_fields = [("coq_corpus_1.{}".format(x) 
                                  if x != self.corpus_id 
                                  else str(token_id)) for x in corpus_columns]
                padded_join, padded_words, padded_join_str = get_sources(width - n)

                S = sql_template.format(
                    corpus_ngram=self.corpusngram_table,
                    columns=column_str,
                    fields=", ".join(
                        padded_fields + padded_words + [str(max_word)] * n),
                    join=padded_join,
                    token_range=token_range,
                    join_str=padded_join_str)
                connection.execute(S)
    
    def build_optimize(self):
        """
        Optimize the table columns so that they use a minimal amount of 
        disk space.

        For each column, the optimal field type is requested from the 
        database. Columns that link to another table get the optimal field
        type of the primary column of the linked table. If the optimal type
        differs from the current type, and if it is not a text type, the 
        field type of the column is modified in the database, and stored as
        the new data type of the column.

        Columns for which the optimal type cannot be determined (indicated
        by a TypeError) are skipped. Errors that occur while a field type 
        is modified are printed and logged, but do not stop the method.

        If a widget is available, its progress is updated for each 
        processed column. The method returns early if self.interrupted is
        set when the columns of the next table are about to be processed.
        """
        totals = sum(
            len(table.columns) for table in self._new_tables.values()) - 1

        if self._widget:
            self._widget.progressSet.emit(totals, "Optimizing table columns... (%v of %m)")
            self._widget.progressUpdate.emit(0)

        column_count = 0

        for table in self._new_tables.values():
            if self.interrupted:
                return

            for column in table.columns:
                # Links should get the same optimal data type as the linked 
                # column:
                is_link = column.key
                try:
                    if is_link:
                        linked = self._new_tables[column._link]
                        primary = linked.primary
                        optimal = self.DB.get_optimal_field_type(
                            linked.name, primary.name).strip()
                    else:
                        optimal = self.DB.get_optimal_field_type(
                            table.name, column.name).strip()
                except TypeError:
                    continue

                current = self.DB.get_field_type(table.name, column.name).strip()
                type_differs = current.lower() != optimal.lower()
                if type_differs and "text" not in optimal.lower().split()[0].strip():
                    optimal = utf8(optimal)
                    try:
                        self.DB.modify_field_type(table.name, column.name, optimal)
                    except Exception as e:
                        print(e)
                        self.logger.warning(e)
                    else:
                        self.logger.info("Optimized column {}.{} from {} to {}".format(
                            table.name, column.name, current, optimal))
                        column.data_type = optimal

                column_count += 1
                if self._widget:
                    self._widget.progressUpdate.emit(column_count + 1)
        
    def build_create_indices(self):
        """ 
        Create a MySQL index for each column in the database. 

        In Coquery, each column of a corpus table can be included in the
        output, and the columns are also available for filtering. As access
        to MySQL columns can be very significantly faster if the column is
        indexed, the corpus builder creates indices for any data column.
        Columns that are instances of Identifier do not get an index by 
        this method.

        The downside is that indexing may take considerable time for larger
        corpora such as the British National Corpus or the Corpus of 
        Contemporary American English. Indices also increase the disk space
        required to store the corpus database.

        However, the performance increase won by indexing usually clearly 
        outweighs these disadvantages.

        Errors that occur while an index is created are printed and logged,
        but do not stop the method. If a widget is available, its progress 
        is set to the number of columns that have been handled so far. The 
        method returns early if self.interrupted is set.
        """
        index_list = [
            (table.name, column.name)
            for table in self._new_tables.values()
            for column in table.columns
            if not isinstance(column, Identifier)]

        if self._widget:
            self._widget.progressSet.emit(len(index_list), "Creating indices... (%v of %m)")
            self._widget.progressUpdate.emit(0)

        for count, (table, column) in enumerate(index_list, 1):
            if self.interrupted:
                return

            try:
                this_column = self._new_tables[table].get_column(column)

                # indices for TEXT/BLOB columns require a key length:
                if this_column.base_type.endswith(("TEXT", "BLOB")):
                    if this_column.index_length:
                        length = this_column.index_length
                    else:
                        length = sqlhelper.get_index_length(self.DB.engine, table, column)
                else:
                    length = None

                self.DB.create_index(table, column, [column], index_length=length)
            except Exception as e:
                print(e)
                self.logger.warning(e)

            # report the number of handled columns, so that the progress 
            # ends exactly at the maximum that was announced above:
            if self._widget:
                self._widget.progressUpdate.emit(count)

    @staticmethod
    def get_class_variables():
        return dir(BaseCorpusBuilder)

    def set_query_items(self):
        """
        Set up the mapping between query item types and resource features.

        This method generates the attributes that specify in the corpus
        module in which field the program should look when looking for word,
        lemma, transcription, gloss, or part-of-speech information. These
        mappings can be set in the corpus installer by using the
        add_query_item() method.

        If no mapping has been set for a query item type, a list of default
        resource features is probed. The first of these resource features
        that is provided by the corpus will be used when evaluating the
        respective query item.

        Mappings are realized by setting the attributes query_item_word,
        query_item_lemma, query_item_transcript, query_item_pos, and
        query_item_gloss. If an attribute is already set, it is left
        untouched. If it is not set and none of the default resource
        features is available, the attribute is not created. In effect,
        that query item type will not be available for that corpus.

        These are the default resource features (in order; the first will be
        considered first):

        Query item type Resource features
        --------------------------------------------------------------------
        Word            word_label, corpus_word
        Lemma           lemma_label, word_lemma, corpus_lemma
        Transcript      transcript_label, word_transcript, corpus_transcript
        POS             pos_label, word_pos, lemma_pos, corpus_pos
        Gloss           gloss_label, word_gloss, lemma_gloss, corpus_gloss

        """
        # attribute name -> default resource features, in order of priority
        defaults = (
            ("query_item_word",
                ("word_label", "corpus_word")),
            ("query_item_lemma",
                ("lemma_label", "word_lemma", "corpus_lemma")),
            ("query_item_transcript",
                ("transcript_label", "word_transcript", "corpus_transcript")),
            ("query_item_pos",
                ("pos_label", "word_pos", "lemma_pos", "corpus_pos")),
            ("query_item_gloss",
                ("gloss_label", "word_gloss", "lemma_gloss", "corpus_gloss")),
        )

        for attribute, candidates in defaults:
            # an existing mapping always takes precedence over the defaults
            if hasattr(self, attribute):
                continue
            for feature in candidates:
                if hasattr(self, feature):
                    setattr(self, attribute, feature)
                    break

    def set_surface_feature(self, rc_feature):
        """
        Store the resource feature that is used as the surface feature.

        The surface feature is the feature that is used to display the
        context of tokens, either in the context viewer or in the results
        table. By default, the surface feature is the same as the word query
        feature.

        Parameters
        ----------
        rc_feature : string
            The resource feature that becomes the new surface feature. It
            is stored in the attribute ``surface_feature``.
        """
        self.surface_feature = rc_feature

    def verify_corpus(self):
        """
        Apply some basic checks to determine whether a database is
        available to the corpus module.

        This method first checks whether a database with the name given in
        the arguments exists on the current server. It then tests whether
        the database contains all data tables listed in
        ``self.table_description``. Every missing database or table is
        logged as a warning; the checks are not aborted after the first
        failure.

        Returns
        -------
        bool : boolean
            True if the database and all tables in the table
            descriptions exist, or False otherwise.
        """
        db_name = self.arguments.db_name
        verified = True

        if not sqlhelper.has_database(options.cfg.current_server, db_name):
            verified = False
            self.logger.warning("Database {} not found.".format(db_name))

        for table_name in self.table_description:
            if sqlhelper.has_table(self.DB.engine, table_name):
                continue
            self.logger.warning("Table {} not found.".format(table_name))
            verified = False

        return verified
    
    def get_module_path(self, name):
        """
        Return the path of the corpus module file for the given name.

        The file is located in the directory ``options.cfg.corpora_path``,
        and its file name is the given name followed by ``.py``.

        Parameters
        ----------
        name : str
            The name used as the base name of the module file

        Returns
        -------
        path : str
            The path to the corpus module.
        """
        file_name = "{}.py".format(name)
        return os.path.join(options.cfg.corpora_path, file_name)

    def build_write_module(self):
        """
        Write the corpus module to the Coquery corpus module directory.

        Nothing is written unless the argument ``w`` is set. Otherwise, the
        template in ``self.module_code`` is filled in, the resulting code
        is stored in ``self.module_content``, and it is written as a UTF-8
        encoded file to the path that :func:`get_module_path` returns for
        the database name.
        """
        if not self.arguments.w:
            return
        inherited = type(self).get_class_variables()

        # The mappings between query item types and resource features have
        # to be initialized before the attributes are collected so that
        # they become part of the corpus module as well:
        self.set_query_items()

        # An attribute is considered to be part of the database
        # specification, and is included with its value in the Python
        # code, if it
        # - is not among the attributes of the base class,
        # - does not start with an underscore '_', and
        # - is not a method.
        specification = []
        for attribute in dir(self):
            if attribute in inherited or attribute.startswith("_"):
                continue
            if inspect.ismethod(getattr(self, attribute)):
                continue
            specification.append(attribute)

        assignments = []
        for attribute in sorted(specification):
            value = getattr(self, attribute)
            # numbers are written verbatim, all other values are quoted:
            if isinstance(value, (int, float)):
                template = "    {} = {}"
            else:
                template = "    {} = '{}'"
            assignments.append(template.format(attribute, value))
        variable_code = "\n".join(assignments)

        content = self.module_code.format(
            name=self.name,
            display_name=utf8(self.get_name()),
            db_name=utf8(self.arguments.db_name),
            url=utf8(self.get_url()),
            variables=utf8(variable_code),
            corpus_code=utf8(self.get_corpus_code()),
            lexicon_code=utf8(self.get_lexicon_code()),
            resource_code=utf8(self.get_resource_code()))
        # every backslash in the generated code is doubled
        self.module_content = content.replace("\\", "\\\\")

        target = self.get_module_path(self.arguments.db_name)
        # write module code:
        with codecs.open(target, "w", encoding="utf-8") as module_file:
            module_file.write(self.module_content)
            self.logger.info("Corpus module %s written." % target)
            
    def setup_db(self):
        """
        Create a connection to the server, and create the database if
        necessary.

        If the argument ``c`` is set, a database with the name from the
        arguments is created on the current server; an already existing
        database with that name is dropped first. After that, the
        connection wrapper is stored in ``self.DB``, and the database is
        selected.
        """
        server = options.cfg.current_server
        args = self.arguments

        if args.c:
            # start from scratch if the database already exists
            if sqlhelper.has_database(server, args.db_name):
                sqlhelper.drop_database(server, args.db_name)
            sqlhelper.create_database(server, args.db_name)

        connection_settings = dict(
            Host=args.db_host,
            Port=args.db_port,
            Type=args.db_type,
            User=args.db_user,
            Password=args.db_password,
            db_name=args.db_name,
            local_infile=1)
        self.DB = sqlwrap.SqlDB(**connection_settings)

        self.DB.use_database(args.db_name)

    def add_building_stage(self, stage):
        """
        Register an additional building stage.

        The registered functions are executed by :func:`build` after the
        database tables have been created and the data files have been
        processed, but before the tables are optimized and indexed. More
        than one function can be added; they are executed in the order in
        which they were added.

        Parameters
        ----------
        stage : callable
            A function that is called without arguments during the build
        """
        self.additional_stages.append(stage)

    @staticmethod
    def get_license():
        return "(license not specified)"

    @staticmethod
    def get_title():
        return "(no title)"

    @staticmethod
    def get_language():
        "Return the corpus language as a string."
        return "(unspecified)"
    
    @staticmethod
    def get_language_code():
        """
        Return the ISO 639-1 code for the corpus language. Variant sub-codes 
        may be linked by a hyphen, e.g. "en-NG" for Nigerian English".
        """
        return "(unspecified)"

    @staticmethod
    def get_description():
        return []

    @staticmethod
    def get_references():
        return []

    @staticmethod
    def get_url():
        return "(no URL)"

    @staticmethod
    def get_modules():
        """
        Return the Python modules that are required by this builder class.
        
        Returns
        -------
        l : list of tuples
            A list of tuples describing the Python modules that are required 
            by this module. Each tuple consists of the module name, the 
            package name, and the URL for this package.
        """
        return []
    
    @staticmethod
    def get_name():
        return "(unnamed)"
    
    @staticmethod
    def get_db_name():
        return "unnamed"

    @staticmethod
    def get_installation_note():
        return ""

    def build_initialize(self):
        """
        Initialize the corpus build.

        The start time of the build is stored in ``self.start_time``, and
        the name of the corpus and the command line arguments are logged.
        If no widget is attached, the corpus description is printed.

        After that, the availability of the additional Python modules
        required by the installer is checked, and the session variables of
        the database connection are set if a MySQL database is used.

        Raises
        ------
        DependencyError
            If one of the modules listed by :func:`get_modules` cannot be
            imported (only checked if the argument ``only_module`` is not
            set).
        ImportError
            If the argument ``use_nltk`` is set, but NLTK cannot be
            imported.
        """
        import importlib

        self.start_time = time.time()
        self.logger.info("--- Starting ---")
        self.logger.info("Building corpus %s" % self.name)
        self.logger.info("Command line arguments: %s" % " ".join(sys.argv[1:]))
        if not self._widget:
            print("\n%s\n" % textwrap.TextWrapper(width=79).fill(" ".join(self.get_description())))

        # Corpus installers may require additional modules. For example,
        # Gabra is currently distributed as MongoDB files, which are read by
        # using the pymongo library.
        # Unless the user wishes to install only the corpus module, try to
        # import these additional modules, and raise an exception if they are
        # unavailable:
        if not self.arguments.only_module:
            for module, package, url in self.get_modules():
                try:
                    # import by name instead of executing a code string
                    importlib.import_module(module)
                except ImportError:
                    raise DependencyError(package, url)
        if self.arguments.use_nltk:
            # the import only serves as an early availability check
            import nltk  # noqa: F401

        if self.DB.db_type == SQL_MYSQL:
            self.DB.connection.execute("SET NAMES 'utf8'")
            self.DB.connection.execute("SET CHARACTER SET 'utf8mb4'")
            # checks are switched off for the current session
            self.DB.connection.execute("SET unique_checks=0")
            self.DB.connection.execute("SET foreign_key_checks=0")

    def remove_build(self):
        """
        Remove the files that have been written during the build.

        This method is called if the build was interrupted by the user or
        because an exception occurred. It attempts to remove the following
        files:

        - the corpus module, i.e. the file that :func:`build_write_module`
          writes to the path returned by :func:`get_module_path` for the
          database name
        - the corpus installer in case of adhoc corpora, i.e. the file
          ``coq_install_<db_name>.py`` in ``options.cfg.adhoc_path``

        Removing is done on a best-effort basis: a file that does not exist
        or that cannot be removed is silently skipped.

        Note that the database itself is currently NOT dropped, even if it
        was created during the build.
        """
        # NOTE(review): dropping the database created by the build
        # (sqlhelper.drop_database) used to be attempted here, but had been
        # commented out; it stays disabled.

        # The module is written by build_write_module() using the database
        # name, so the same name has to be used to locate it here:
        path = self.get_module_path(self.arguments.db_name)
        try:
            os.remove(path)
        except OSError:
            # the module may never have been written
            pass

        path = os.path.join(options.cfg.adhoc_path, "coq_install_{}.py".format(self.arguments.db_name))
        try:
            os.remove(path)
        except OSError:
            # only adhoc corpora have an installer at this location
            pass
        
    def build_finalize(self):
        """
        Wrap up everything after the corpus installation is complete.

        If the build was interrupted, everything that has been built so far
        is removed by calling :func:`remove_build`. In either case, the
        outcome and the time that has passed since ``self.start_time`` are
        logged.
        """
        if self.interrupted:
            self.remove_build()
            template = "--- Interrupted building {} (after {:.3f} seconds) ---"
        else:
            template = "--- Done building {} (after {:.3f} seconds) ---"
        self.logger.info(template.format(self.name, time.time() - self.start_time))
        
    def build(self):
        """ 
        Build the corpus database, and install the corpus module.
        
        This method runs all steps required to make the data from a corpus 
        available to Coquery. Most importantly, it calls these functions (in
        order):
        
        * :func:`build_initialize` to set up the building process, which includes loading the required modules
        * :func:`build_create_tables` to create all SQL tables that were specified by previous calls to :func:`create_table_description`
        * :func:`build_load_files` to read all datafiles, process their content, and insert the content into the SQL tables
        * :func:`build_lookup_ngram` to create an n-gram lookup table that increases query performance of multi-item queries, but which requires a lot of disk space
        * the functions registered by :func:`add_building_stage`
        * :func:`build_optimize` to ensure that the SQL tables use the optimal data format for the data
        * :func:`build_create_indices` to create database indices that speed up the SQL queries
        * :func:`build_write_module` to write the corpus module to the ``corpora`` sub-directory of the Coquery install directory (or the corpus directory specified in the configuration file)
        * :func:`build_create_frequency_table`
        * :func:`build_finalize` to log the outcome of the build
        
        Most of the stages are only executed if the corresponding argument 
        is set, and no further stage is started once ``self.interrupted`` 
        is set. 
        
        An exception raised by :func:`build_lookup_ngram` is logged and 
        printed, and the build continues. Any other exception raised by a 
        stage causes :func:`remove_build` to be called before the exception 
        is re-raised. The connection pool of the database engine is 
        disposed in either case.
        
        .. note:: 
        
            Self-joined tables are currently not supported by 
            :class:`BaseCorpusBuilder`. Corpus installers that want to use
            this feature have to override :func:`build_lookup_ngram`.

        Raises
        ------
        RuntimeError
            If data files are to be loaded or tables are to be created, 
            but :func:`validate_path` rejects the path from the arguments.
        """

        def progress_next(count):
            # announce the next stage to the widget (if any) and return the
            # updated stage counter
            if self._widget:
                count += 1
                self._widget.generalUpdate.emit(count)
                self._widget.progressSet.emit(0, "")
            return count
        
        def progress_done():
            # reset the progress of the current stage
            if self._widget:
                self._widget.progressUpdate.emit(0)
        
        self.check_arguments()
        if not self._widget:
            self.setup_logger()

        if self._widget:
            # NOTE(review): this count does not obviously match the number 
            # of progress_next() calls made below -- confirm.
            steps = 2 + int(self.arguments.c) + int(self.arguments.l) + int(self.arguments.lookup_ngram) + int(self.additional_stages != []) + int(self.arguments.o) + int(self.arguments.i) 
            self._widget.ui.progress_bar.setMaximum(steps)

        if (self.arguments.l or self.arguments.c) and not self.validate_path(self.arguments.path):
            raise RuntimeError("The given path {} does not appear to contain valid corpus data files.".format(self.arguments.path))
        
        current = 0

        current = progress_next(current)
        self.setup_db()
        try:
            with self.DB.engine.connect() as self.DB.connection:
                self.build_initialize()
                progress_done()

                try:
                    if self.arguments.c and not self.interrupted:
                        current = progress_next(current)
                        self.build_create_tables()
                        progress_done()
                    else:
                        # At the very least, the tag table features are added so
                        # that the corpus module will always contain that table.
                        self.add_tag_table(features_only=True)
                
                    if self.arguments.l and not self.interrupted:
                        current = progress_next(current)
                        self.build_load_files()
                        self.commit_data()
                        progress_done()

                    # A failure to build the n-gram lookup table is not 
                    # fatal: it is reported, and the build continues.
                    try:
                        if self.arguments.lookup_ngram and not self.interrupted:
                            current = progress_next(current)
                            self.build_lookup_ngram()
                            progress_done()
                    except Exception as e:
                        self.logger.error("Error building ngram lookup: {}".format(e))
                        print(e)

                    if not self.interrupted:
                        current = progress_next(current)
                        for stage in self.additional_stages:
                            if not self.interrupted:
                                stage()
                        progress_done()

                    if self.arguments.o and not self.interrupted:
                        current = progress_next(current)
                        self.build_optimize()
                        progress_done()

                    if self.arguments.i and not self.interrupted:
                        current = progress_next(current)
                        self.build_create_indices()
                        progress_done()
                        
                    if self.verify_corpus() and not self.interrupted:
                        current = progress_next(current)
                        self.build_write_module()
                        # NOTE(review): the other stages call progress_done() 
                        # at this point; progress_next() advances the stage 
                        # counter a second time -- confirm that this is 
                        # intended.
                        current = progress_next(current)

                    if not self.interrupted:
                        current = progress_next(current)
                        self.build_create_frequency_table()
                        progress_done()

                    self.build_finalize()
                except Exception as e:
                    for x in get_error_repr(sys.exc_info()):
                        print(x)
                    warnings.warn(str(e))
                    print(str(e))
                    self.remove_build()
                    # re-raise with the original traceback
                    raise
        finally:
            # release the pooled connections even if the build has failed
            self.DB.engine.dispose()
        
    def create_installer_module(self):
        """
        Read the Python source of coq_install_generic.py, and modify it so 
        that it can be stored as an adhoc installer module.
        
        This method is a generator. The lines of the generic installer are 
        passed through with the following modifications:

        - The content of every block that is delimited by lines starting 
          with three double quotes is replaced by ``new_doc_string``.
        - After the line starting with ``class BuilderClass``, the code 
          generated from ``new_code_str`` is inserted in front of the first 
          line that does not start with four spaces.
        - If the argument ``use_nltk`` is set, ``pos=False`` is replaced by 
          ``pos=True`` in the signature of ``__init__``, and a ``word_pos`` 
          assignment is inserted in front of the ``word_lemma`` assignment.

        Yields
        ------
        line : str
            The next piece of source code of the adhoc installer module
        """
        # NOTE(review): the file is opened without an explicit encoding, 
        # whereas the corpus module is written as UTF-8 -- confirm that 
        # this is intended.
        with codecs.open(os.path.join(
                            options.cfg.installer_path,
                            "coq_install_generic.py"), 
                        "r") as input_file:
            source = input_file.readlines()
            
        if self.arguments.use_nltk:
            import nltk
            is_tagged_label = "POS-tagged text corpus"
            try:
                tagging_state = "Part-of-speech tags were assigned using <code>{}</code>, the recommended tagger from the Natural Language Toolkit (NLTK) version {}. NLTK used data from WordNet for lemmatization.".format(
                    nltk.tag._POS_TAGGER.split("/")[1],
                    nltk.__version__)
            except Exception:
                # the name of the tagger cannot be determined from the 
                # private attribute; use a generic description instead
                tagging_state = "Part-of-speech tags were assigned using the default tagger from the Natural Language Toolkit (NLTK) version {}. NLTK used data from WordNet for lemmatization.".format(
                    nltk.__version__)
        else:
            is_tagged_label = "text corpus"
            tagging_state = "Part-of-speech tags are not available for this corpus."
        
        # 'user' is passed to format(), but is not referenced by the 
        # template at present
        description = "<p>The {label} '{name}' was created on {date}. It contains {tokens} text tokens. {tagging_state}</p><p>Directory:<br/> <code>{path}</code></p><p>File{s}:<br/><code>{files}</code></p><p>".format(
            label = utf8(is_tagged_label),
            date = utf8(time.strftime("%c")),
            user = utf8(getpass.getuser()),
            name = utf8(self.arguments.name),
            path = utf8(self.arguments.path),
            s = "s" if len(self._file_list) > 1 else "",
            files = "<br/>".join([utf8(os.path.basename(x)) for x in sorted(self._file_list)]),
            tokens = self._corpus_id,
            tagging_state = utf8(tagging_state))
        
        new_code = new_code_str.format(
            name=self.name, 
            db_name=self.arguments.db_name,
            is_tagged_corpus=is_tagged_label,
            description=description)
        new_code = new_code.replace("\\", "\\\\")
        in_class = False
        in_docstring = False

        for x in source:
            if self.arguments.use_nltk:
                if x.strip().startswith("def __init__") and "pos=False" in x:
                    x = x.replace("pos=False", "pos=True")

            # A line starting with three double quotes toggles the 
            # docstring state. The opening line and the content of the 
            # docstring are replaced; the closing line is passed through.
            if x.startswith('"""'):
                if not in_docstring:
                    in_docstring = True
                    yield '"""'
                    yield new_doc_string
                else:
                    in_docstring = False
            if in_docstring:
                continue
            
            if not in_class:
                if x.startswith("class BuilderClass"):
                    in_class = True
                    yield x
                    continue
            else:
                # the first line after the class header that does not 
                # start with four spaces triggers the insertion of the 
                # new code
                if not x.startswith("    "):
                    in_class = False
                    yield new_code

            if self.arguments.use_nltk:
                if x.strip().startswith("word_lemma ="):
                    yield "    word_pos = 'POS'\n"
            yield x

