model[['my', 'document']]
FastText:
- infer a vector for each word and compute the document vector as the average of its word vectors
from gensim.corpora import Dictionary
from gensim.models import LsiModel

# Minimal LSI example: two toy documents, bag-of-words corpus, default topics.
texts = [["a", "a", "b"], ["c", "d"]]
vocab = Dictionary(texts)
bow_corpus = [vocab.doc2bow(text) for text in texts]
model = LsiModel(bow_corpus, id2word=vocab)
# Projecting the corpus yields sparse (topic_id, weight) pairs per document:
# [[(0, 2.236067977499789)], [(1, -1.4142135623730951)]]
list(model[bow_corpus])
# Build an LSI model over `docs` (defined elsewhere) and collect each
# document's topic-weight vector into a NumPy array.
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
model = LsiModel(corpus, id2word=dictionary, num_topics=10)

# Report documents that contain no tokens at all — these can only ever
# produce an empty projection.
for doc in docs:
    if not doc:  # idiomatic emptiness check instead of len(doc) == 0
        print("doc: ", doc)

# Collect topic weights per document. NOTE(review): gensim filters out
# near-zero projections, so some documents may come back as empty lists
# even when the input document was non-empty.
vecs = []
for doc in model[corpus]:  # iterate the stream directly; no list() needed
    x = [weight for _topic_id, weight in doc]
    if not x:
        print("vec: ", x)
    vecs.append(x)
# Ragged rows (empty vs length-10) will make this an object array — the
# empty projections above are the root cause to fix first.
vecs = np.array(vecs, float)
vec: []
vec: []
vec: []
vec: []
full2sparse
function https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/lsimodel.py#L486 (but believe me, you do not need it; these values are filtered for a reason — values that small are just noise and carry no useful information).
while
training.
# Train FastText on a tiny corpus with externally supplied word frequencies,
# then represent each document as the mean of its word vectors.
docs = [['this', 'is', 'some'], ['this', 'example'], ['example', 'is', 'this']]
model = FastText(workers=ncores, min_count=1)  # ncores defined elsewhere
corpus = {'this': 1, 'is': 200, 'some': 5, 'example': 1}
model.build_vocab_from_freq(corpus, corpus_count=len(docs))
# model.epochs replaces the deprecated model.iter attribute.
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

# Document vector = average of its word vectors. model.wv[token] is the
# supported lookup; bare model[token] was deprecated and removed in gensim 4.
vecs = [np.mean([model.wv[token] for token in doc], axis=0) for doc in docs]