text_classifier.py 文件源码-python代码片段

def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
        """Definido en la declaracion de la clase.

        Attributes:
            texts (list of str): Textos a clasificar.
            ids (list of str): Identificadores únicos para cada texto (debe
                tener la misma longitud que `texts`).
            vocabulary (list): Opcional. Vocabulario a tener en cuenta para la
                vectorización de los textos. Default: usa todas las palabras
                presentes en los textos, salvo los ES_stopwords.txt.
            encoding (str): Codificación de los textos en `texts` y en `ids`.
        """
        this_dir, this_filename = os.path.split(__file__)
        es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                                   header=None, encoding='utf-8')
        es_stopwords = list(np.squeeze(es_stopwords.values))
        self._check_id_length(ids)
        self.vectorizer = CountVectorizer(
            input='content', encoding=encoding, decode_error='strict',
            strip_accents='ascii', lowercase=True, preprocessor=None,
            tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
            analyzer='word', max_df=0.8, min_df=1, max_features=None,
            vocabulary=vocabulary, binary=False)

        self.transformer = TfidfTransformer()
        self.ids = None  # Matiene una lista ordenada de ids de textos.
        self.term_mat = None  # Matriz que cuenta los terminos en un texto.
        self.tfidf_mat = None  # Matriz de relevancia de los terminos.
        self.reload_texts(texts, ids)