svd2vec output

Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora.
Gensim can use word2vec to compute similarity (and more!) between words. svd2vec can save its vectors in the word2vec format, which Gensim can process.
This notebook shows how to use Gensim with vectors learnt by svd2vec. We also compare our results with the pure word2vec model.
from svd2vec import svd2vec, FilesIO
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors
# Gensim does not ship an analogy method, so this helper supplies one
# (3CosAdd) on top of `most_similar`.
def analogy_keyed(self, a, b, c, topn=10):
    """Answer "a is to b as c is to ?" for a keyed-vectors object.

    ``b`` and ``c`` are passed to ``self.most_similar`` as positive words and
    ``a`` as the negative word; the ``topn`` best candidates are returned
    exactly as ``most_similar`` produces them.
    """
    pulled_towards = [b, c]
    pushed_away = [a]
    return self.most_similar(
        positive=pulled_towards,
        negative=pushed_away,
        topn=topn,
    )
# Attach the helper to the class so it can be called as `vectors.analogy(...)`
# on any Word2VecKeyedVectors instance.
Word2VecKeyedVectors.analogy = analogy_keyed
def analogy_w2v(self, a, b, c, topn=10):
    """Answer "a is to b as c is to ?" for a full model exposing ``.wv``.

    Delegates to ``self.wv.most_similar`` with ``b`` and ``c`` as positive
    words and ``a`` as the negative word, returning its ``topn`` results
    unchanged.
    """
    keyed_vectors = self.wv
    return keyed_vectors.most_similar(
        positive=[b, c],
        negative=[a],
        topn=topn,
    )
# Attach the helper to the class so it can be called as `model.analogy(...)`
# on any Word2Vec instance.
Word2Vec.analogy = analogy_w2v
# Learn the svd2vec embeddings from the "text8" corpus.
documents = FilesIO.load_corpus("text8")
svd2vec_svd = svd2vec(
    documents,
    size=300,
    window=5,
    min_count=100,
    verbose=False,
)

# Gensim cannot read the svd2vec object directly: write the vectors out in
# the word2vec format first ...
svd2vec_svd.save_word2vec_format("svd.word2vec")

# ... then read that file back through Gensim.
gensim_svd = Word2VecKeyedVectors.load_word2vec_format("svd.word2vec")
import os
if not os.path.isfile("w2v.word2vec") or True:
    # we train the model using word2vec (needs to be installed)
    !word2vec -min-count 100 -size 300 -window 5 -train text8 -output w2v.word2vec
# Read the vectors stored in "w2v.word2vec" into Gensim.
word2vec_w2v = Word2VecKeyedVectors.load_word2vec_format("w2v.word2vec")

# Train a second word2vec model, this time with Gensim's own implementation,
# on the same documents and with the same size / window / min_count.
gensim_w2v = Word2Vec(
    documents,
    size=300,
    window=5,
    min_count=100,
    workers=16,
)
def compare_similarity(w1, w2):
    """Print the similarity of ``w1`` and ``w2`` as reported by each model.

    One header line is printed, followed by one line per model in the order
    svd2vec_svd, gensim_svd, gensim_w2v, word2vec_w2v.
    """
    print("cosine similarity between", w1, "and", w2, ":")

    # (label, similarity function) for each model; gensim_w2v is a full
    # model, so its similarity function lives on the `.wv` attribute.
    scorers = (
        ("\tsvd2vec_svd ", svd2vec_svd.similarity),
        ("\tgensim_svd  ", gensim_svd.similarity),
        ("\tgensim_w2v  ", gensim_w2v.wv.similarity),
        ("\tword2vec_w2v", word2vec_w2v.similarity),
    )
    for label, similarity in scorers:
        print(label, similarity(w1, w2))
def compare_analogy(w1, w2, w3, topn=3):
    """Print each model's best answers to "w1 is to w2 as w3 is to ?".

    Every model is queried through its ``analogy(w1, w2, w3, topn=topn)``
    method, so the Gensim objects must already have the ``analogy`` helper
    attached to their classes.

    Args:
        w1: First word of the reference pair.
        w2: Second word of the reference pair.
        w3: Word whose counterpart is sought.
        topn: Number of candidates requested from each model.
    """

    def analogy_str(model):
        # Each result is unpacked as a (word, score) pair; the word is
        # left-aligned in a 20-character column so the scores line up.
        candidates = model.analogy(w1, w2, w3, topn=topn)
        rows = ["{: <20}".format(word) + str(score) for word, score in candidates]
        return "\n\t\t" + "\n\t\t".join(rows)

    # Header typo fixed: "similaties" -> "similarities".
    print("analogy similarities :", w1, "is to", w2, "as", w3, "is to?")
    print("\tsvd2vec_svd", analogy_str(svd2vec_svd))
    print("\tgensim_svd", analogy_str(gensim_svd))
    print("\tgensim_w2v", analogy_str(gensim_w2v))
    print("\tword2vec_w2v", analogy_str(word2vec_w2v))
# Spot-check a couple of word pairs across the four models.
compare_similarity("good", "bad")
compare_similarity("truck", "car")
# Spot-check a few analogies ("first is to second as third is to ?").
compare_analogy("january", "month", "monday")
compare_analogy("paris", "france", "berlin")
compare_analogy("man", "king", "woman")
compare_analogy("road", "cars", "rail")
def compare_similarity(path, d='\t'):
    """Print each model's correlation score on the word-pair file ``path``.

    NOTE: this re-binds the name ``compare_similarity``; from here on the
    earlier two-word version is no longer reachable under that name.

    Args:
        path: Location of the word-pair evaluation file.
        d: Column delimiter, forwarded as ``delimiter=`` to every model.
    """
    print("pearson correlation of", os.path.basename(path))

    # The svd2vec result is indexed once to get the printed value ...
    svd_result = svd2vec_svd.evaluate_word_pairs(path,   delimiter=d)
    print("\tsvd2vec_svd   ", svd_result[0])

    # ... whereas the Gensim objects return a nested result indexed twice.
    gensim_style = (
        ("\tgensim_svd    ", gensim_svd),
        ("\tgensim_w2v    ", gensim_w2v.wv),
        ("\tword2vec_w2v  ", word2vec_w2v),
    )
    for label, vectors in gensim_style:
        result = vectors.evaluate_word_pairs(path, delimiter=d)
        print(label, result[0][0])

    print("")
# Run the word-pair evaluation on each similarity file located via FilesIO.path.
compare_similarity(FilesIO.path('similarities/wordsim353.txt'))
compare_similarity(FilesIO.path('similarities/men_dataset.txt'))
compare_similarity(FilesIO.path('similarities/mturk.txt'))
compare_similarity(FilesIO.path('similarities/simlex999.txt'))
compare_similarity(FilesIO.path('similarities/rarewords.txt'))
def compare_analogy(path):
    """Print each model's score on the analogy question file ``path``.

    NOTE: this re-binds the name ``compare_analogy``; from here on the
    earlier three-word version is no longer reachable under that name.
    """
    print("analogies success rate of", os.path.basename(path))

    # The svd2vec result is printed as returned ...
    svd_score = svd2vec_svd.evaluate_word_analogies(path)
    print("\tsvd2vec_svd   ", svd_score)

    # ... while for the Gensim objects only the first element is printed.
    gensim_style = (
        ("\tgensim_svd    ", gensim_svd),
        ("\tgensim_w2v    ", gensim_w2v.wv),
        ("\tword2vec_w2v  ", word2vec_w2v),
    )
    for label, vectors in gensim_style:
        outcome = vectors.evaluate_word_analogies(path)
        print(label, outcome[0])
# Run the analogy evaluation on each question file located via FilesIO.path.
compare_analogy(FilesIO.path('analogies/questions-words.txt'))
compare_analogy(FilesIO.path('analogies/msr.txt'))