svd2vec
output¶Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora.
Gensim can use word2vec
to compute similarity (and more!) between words. svd2vec
can save its vectors in a word2vec
format that Gensim can process.
This notebook shows how to use Gensim with vectors learnt from svd2vec.
We also compare our results with the pure word2vec model.
from svd2vec import svd2vec, FilesIO
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors
# Gensim offers no dedicated analogy method, so one is attached here using
# the 3CosAdd formulation.
def analogy_keyed(self, a, b, c, topn=10):
    """Answer "a is to b as c is to ?" on a keyed-vectors object.

    Delegates to ``self.most_similar`` with ``b`` and ``c`` as the positive
    terms and ``a`` as the negative term, and returns its result unchanged.

    Args:
        a: Word on the left of the known pair.
        b: Word on the right of the known pair.
        c: Word whose counterpart is being asked for.
        topn: Number of candidates requested from ``most_similar``.
    """
    attract = [b, c]
    repel = [a]
    return self.most_similar(positive=attract, negative=repel, topn=topn)


# Expose the helper as a method on every Word2VecKeyedVectors instance.
Word2VecKeyedVectors.analogy = analogy_keyed
def analogy_w2v(self, a, b, c, topn=10):
    """Answer "a is to b as c is to ?" on a full Word2Vec model.

    Same query as the keyed-vectors variant, but routed through the model's
    ``wv`` attribute, whose ``most_similar`` result is returned unchanged.

    Args:
        a: Word on the left of the known pair.
        b: Word on the right of the known pair.
        c: Word whose counterpart is being asked for.
        topn: Number of candidates requested from ``most_similar``.
    """
    vectors = self.wv
    return vectors.most_similar(positive=[b, c], negative=[a], topn=topn)


# Expose the helper as a method on every Word2Vec model instance.
Word2Vec.analogy = analogy_w2v
# Load the "text8" corpus and learn the svd2vec embeddings from it.
documents = FilesIO.load_corpus("text8")
svd2vec_svd = svd2vec(
    documents,
    size=300,
    window=5,
    min_count=100,
    verbose=False,
)

# Gensim cannot consume the svd2vec object directly: write the vectors out
# in the word2vec format first ...
svd2vec_svd.save_word2vec_format("svd.word2vec")

# ... and then read that file back in through Gensim.
gensim_svd = Word2VecKeyedVectors.load_word2vec_format("svd.word2vec")
import os
if not os.path.isfile("w2v.word2vec") or True:
# we train the model using word2vec (needs to be installed)
!word2vec -min-count 100 -size 300 -window 5 -train text8 -output w2v.word2vec
# we load it
word2vec_w2v = Word2VecKeyedVectors.load_word2vec_format("w2v.word2vec")
gensim_w2v = Word2Vec(documents, size=300, window=5, min_count=100, workers=16)
def compare_similarity(w1, w2):
    """Print the cosine similarity of two words under each of the four models.

    Args:
        w1: First word of the pair.
        w2: Second word of the pair.
    """
    print("cosine similarity between", w1, "and", w2, ":")

    # (label, similarity function) for every model being compared; the full
    # Gensim Word2Vec model is queried through its `wv` attribute.
    scorers = (
        ("\tsvd2vec_svd ", svd2vec_svd.similarity),
        ("\tgensim_svd ", gensim_svd.similarity),
        ("\tgensim_w2v ", gensim_w2v.wv.similarity),
        ("\tword2vec_w2v", word2vec_w2v.similarity),
    )
    for label, similarity in scorers:
        print(label, similarity(w1, w2))
def compare_analogy(w1, w2, w3, topn=3):
    """Print each model's top answers to "w1 is to w2 as w3 is to ?".

    Every model is queried through its ``analogy`` method. For the Gensim
    objects that method is not built in: it relies on the ``analogy``
    attributes assigned onto ``Word2VecKeyedVectors`` and ``Word2Vec``
    earlier in this file.

    Args:
        w1: Word on the left of the known pair.
        w2: Word on the right of the known pair.
        w3: Word whose counterpart is being asked for.
        topn: Number of candidate answers requested from each model.
    """
    def analogy_str(model):
        # One candidate per line: the first item of each pair left-aligned
        # and padded to 20 characters, immediately followed by the second.
        answers = model.analogy(w1, w2, w3, topn=topn)
        rows = ("{: <20}".format(word) + str(score) for word, score in answers)
        return "\n\t\t" + "\n\t\t".join(rows)

    # Header typo fixed: "similaties" -> "similarities".
    print("analogy similarities :", w1, "is to", w2, "as", w3, "is to?")
    print("\tsvd2vec_svd", analogy_str(svd2vec_svd))
    print("\tgensim_svd", analogy_str(gensim_svd))
    print("\tgensim_w2v", analogy_str(gensim_w2v))
    print("\tword2vec_w2v", analogy_str(word2vec_w2v))
# Spot-check a couple of word pairs across all models.
for first, second in (("good", "bad"), ("truck", "car")):
    compare_similarity(first, second)

# Spot-check a few "a is to b as c is to ?" questions across all models.
for question in (
    ("january", "month", "monday"),
    ("paris", "france", "berlin"),
    ("man", "king", "woman"),
    ("road", "cars", "rail"),
):
    compare_analogy(*question)
def compare_similarity(path, d='\t'):
    """Print each model's Pearson correlation on a word-pair dataset file.

    Note: this definition replaces the earlier two-word ``compare_similarity``
    from this point on.

    Args:
        path: Path of the word-pairs file to evaluate against.
        d: Field delimiter used in that file (tab by default).
    """
    print("pearson correlation of", os.path.basename(path))

    # The svd2vec result is indexed once, the Gensim results twice.
    svd_result = svd2vec_svd.evaluate_word_pairs(path, delimiter=d)
    print("\tsvd2vec_svd ", svd_result[0])

    gensim_backed = (
        ("\tgensim_svd ", gensim_svd),
        ("\tgensim_w2v ", gensim_w2v.wv),
        ("\tword2vec_w2v ", word2vec_w2v),
    )
    for label, vectors in gensim_backed:
        result = vectors.evaluate_word_pairs(path, delimiter=d)
        print(label, result[0][0])

    # Blank line separating consecutive dataset reports.
    print("")
# Run the word-pair evaluation on every bundled similarity dataset, in order.
for dataset in (
    'similarities/wordsim353.txt',
    'similarities/men_dataset.txt',
    'similarities/mturk.txt',
    'similarities/simlex999.txt',
    'similarities/rarewords.txt',
):
    compare_similarity(FilesIO.path(dataset))
def compare_analogy(path):
    """Print each model's success rate on an analogy dataset file.

    Note: this definition replaces the earlier word-triple ``compare_analogy``
    from this point on.

    Args:
        path: Path of the analogy questions file to evaluate against.
    """
    print("analogies success rate of", os.path.basename(path))

    # The svd2vec result is printed as returned; for the Gensim objects only
    # the first element of the result is printed.
    print("\tsvd2vec_svd ", svd2vec_svd.evaluate_word_analogies(path))

    gensim_backed = (
        ("\tgensim_svd ", gensim_svd),
        ("\tgensim_w2v ", gensim_w2v.wv),
        ("\tword2vec_w2v ", word2vec_w2v),
    )
    for label, vectors in gensim_backed:
        result = vectors.evaluate_word_analogies(path)
        print(label, result[0])
# Run the analogy evaluation on both bundled analogy datasets, in order.
for dataset in ('analogies/questions-words.txt', 'analogies/msr.txt'):
    compare_analogy(FilesIO.path(dataset))