|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +# |
| 4 | +# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz> |
| 5 | +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html |
| 6 | + |
| 7 | +""" |
| 8 | +Scikit learn interface for gensim for easy use of gensim with scikit-learn |
| 9 | +Follows scikit-learn API conventions |
| 10 | +""" |
| 11 | + |
| 12 | +import numpy as np |
| 13 | +from scipy import sparse |
| 14 | +from sklearn.base import TransformerMixin, BaseEstimator |
| 15 | +from sklearn.exceptions import NotFittedError |
| 16 | + |
| 17 | +from gensim import models |
| 18 | +from gensim import matutils |
| 19 | + |
| 20 | + |
class HdpTransformer(TransformerMixin, BaseEstimator):
    """
    Scikit-learn style wrapper around `gensim.models.HdpModel`.

    The hyperparameters given to the constructor are stored unchanged on the
    instance and forwarded to `gensim.models.HdpModel` when the model is
    built by `fit` or by the first call to `partial_fit`. The trained gensim
    model is kept in `self.gensim_model` (`None` until the estimator is fitted).
    """

    def __init__(self, id2word, max_chunks=None, max_time=None,
            chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
            gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
            outputdir=None, random_state=None):
        """
        Sklearn api for HDP model. See gensim.models.HdpModel for parameter details.
        """
        # Stays None until `fit` / `partial_fit` is called; `transform` checks this.
        self.gensim_model = None
        self.id2word = id2word
        self.max_chunks = max_chunks
        self.max_time = max_time
        self.chunksize = chunksize
        self.kappa = kappa
        self.tau = tau
        self.K = K
        self.T = T
        self.alpha = alpha
        self.gamma = gamma
        self.eta = eta
        self.scale = scale
        self.var_converge = var_converge
        self.outputdir = outputdir
        self.random_state = random_state

    def _build_model(self, corpus):
        """
        Create a `gensim.models.HdpModel` over `corpus` using the stored hyperparameters.

        Shared by `fit` and `partial_fit` so that both construct the model
        with exactly the same arguments.
        """
        return models.HdpModel(
            corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
            max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
            K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
            var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
        )

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        Calls gensim.models.HdpModel

        `X` is either a scipy sparse matrix (wrapped in `matutils.Sparse2Corpus`)
        or a corpus that is handed to gensim as is. `y` is ignored; it is
        accepted only for scikit-learn API compatibility.
        Returns `self`.
        """
        if sparse.issparse(X):
            # NOTE(review): Sparse2Corpus is used with its default orientation;
            # confirm it matches the layout of the matrices callers pass in.
            corpus = matutils.Sparse2Corpus(X)
        else:
            corpus = X

        self.gensim_model = self._build_model(corpus)
        return self

    def transform(self, docs):
        """
        Takes a list of documents as input ('docs').
        Returns a matrix of topic distribution for the given document bow, where a_ij
        indicates (topic_i, topic_probability_j).
        The input `docs` should be in BOW format and can be a list of documents like :
        [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ]
        or a single document like : [(4, 1), (7, 1)]

        The result is a dense numpy array with one row per document. The number
        of columns is the highest topic id returned for any of the given
        documents plus one, so it depends on the input; documents for which the
        model returns no topics yield an all-zero row.

        Raises `NotFittedError` if the model has not been fitted yet.
        """
        if self.gensim_model is None:
            raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

        # A single document is a flat list of tuples; wrap it so that the code
        # below always works on a list of documents. The length check keeps an
        # empty input from failing on `docs[0]`.
        if len(docs) > 0 and isinstance(docs[0], tuple):
            docs = [docs]

        distributions = [self.gensim_model[doc] for doc in docs]

        max_num_topics = 0
        for topics in distributions:
            # A document may come back with no topics at all; skip it here
            # because max() of an empty sequence raises ValueError.
            if len(topics) > 0:
                max_num_topics = max(max_num_topics, max(topic[0] for topic in topics) + 1)

        # returning dense representation for compatibility with sklearn but we should go back to sparse representation in the future
        dense_vectors = [matutils.sparse2full(topics, max_num_topics) for topics in distributions]

        return np.reshape(np.array(dense_vectors), (len(docs), max_num_topics))

    def partial_fit(self, X):
        """
        Train model over X.

        `X` is either a scipy sparse matrix (wrapped in `matutils.Sparse2Corpus`)
        or a corpus that is handed to gensim as is. On the first call the model
        is created from `X`; on later calls the existing model is updated with `X`.
        Returns `self`.
        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(X)

        if self.gensim_model is None:
            # The model has to be constructed with a corpus (as `fit` does), and
            # that construction is what trains it on `X`, so no additional
            # update() pass is made over the same data on this first call.
            self.gensim_model = self._build_model(X)
            return self

        self.gensim_model.update(corpus=X)
        return self
0 commit comments