svd2vec
The corpus (`documents`) parameter of `svd2vec` should be a list of documents. Each document should be a list of words representing that document.
# Saving the word2vec demo corpus locally: download the zip archive and
# unpack it into the current working directory.
import io
import zipfile

import requests

url = "http://mattmahoney.net/dc/text8.zip"
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
r.raise_for_status()
# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()
# Loading the word2vec demo corpus as a single document.
# The context manager closes the file handle deterministically, and the
# explicit encoding makes the read independent of the platform default
# (NOTE(review): text8 is expected to be plain ASCII text - confirm).
with open("text8", "r", encoding="utf-8") as corpus_file:
    # split(" ") is kept so the tokenisation is unchanged; unlike split(), it
    # yields empty strings for leading, trailing or repeated spaces.
    documents = [corpus_file.read().split(" ")]
from svd2vec import svd2vec
# Preview: the first fifteen words of every document, followed by a '...' marker
[doc[:15] + ['...'] for doc in documents]
# Build the word representations from the corpus (this can take a while)
svd = svd2vec(
    documents,
    window=5,
    min_count=100,
    verbose=False,
)
# Example queries against the trained model. Every call below is a bare
# expression: its return value is only visible when run interactively
# (e.g. one notebook cell per call); as a plain script the results are discarded.

# Similarity between two words
svd.similarity("bad", "good")
svd.similarity("monday", "friday")
# Distance between two words
# NOTE(review): presumably the complement of similarity() - confirm in the svd2vec docs
svd.distance("apollo", "moon")
# Words most similar to "january"; topn=2 presumably caps the result at two entries - confirm
svd.most_similar(positive=["january"], topn=2)
# Analogy queries over three words
# NOTE(review): the role of each positional argument is defined by svd2vec.analogy - TODO confirm
svd.analogy("paris", "france", "berlin")
svd.analogy("road", "cars", "rail", topn=5)
svd.analogy("cow", "cows", "pig")
svd.analogy("man", "men", "woman")
# Persist the model to disk in a binary format
binary_path = "svd.binary"
svd.save(binary_path)
# Read the model back from that binary file and query the restored copy
loaded = svd2vec.load(binary_path)
loaded.similarity("bad", "good")
# Also export the model as a word2vec-like representation
word2vec_path = "svd.word2vec"
svd.save_word2vec_format(word2vec_path)