Effect of corpus size on similarity scores

Data preparation

In [251]:
from svd2vec import svd2vec, FilesIO, Utils
from gensim.models import Word2Vec

import os
import time
import pandas as pd

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# Default figure size (width, height) applied to every matplotlib figure created afterwards.
plt.rcParams["figure.figsize"] = (16, 5)
In [2]:
# Load the "text8" corpus through svd2vec's FilesIO helper. The rest of the notebook
# slices this object and calls len() on it, i.e. treats it as a sequence of documents.
all_documents = FilesIO.load_corpus("text8")
In [253]:
def split_documents(n_corpuses=5, documents=None, min_size=15):
    """Split a document list into nested corpora of geometrically decreasing size.

    Each returned corpus is a prefix of ``documents``: the first one holds every
    document, the last one holds ``min_size`` documents, and the sizes in between
    are geometrically spaced (truncated to integers).

    Parameters
    ----------
    n_corpuses : int
        Number of corpora to produce.
    documents : sequence, optional
        Documents to split. Defaults to the notebook-level ``all_documents`` so
        existing calls keep working.
    min_size : int
        Number of documents in the smallest corpus (15 by default, the value
        that used to be hard-coded).

    Returns
    -------
    list
        ``n_corpuses`` slices of ``documents``, largest first.
    """
    if documents is None:
        # Backward-compatible default: fall back to the corpus loaded by the notebook.
        documents = all_documents
    sizes = np.geomspace(len(documents), min_size, num=n_corpuses, dtype=int)
    return [documents[:s] for s in sizes]
In [254]:
def display_stats_corpus(corpus):
    """Print the number of documents, tokens and distinct tokens of a corpus.

    Parameters
    ----------
    corpus : sequence of sequences
        Each element is one document, itself a sequence of hashable tokens.
        An empty corpus is accepted and reported with all counts at 0.
    """
    def format_space(n):
        # Thousands separator rendered as a space, e.g. 1234567 -> "1 234 567".
        return '{:,}'.format(n).replace(',', ' ')
    print("=-=-=-=-=-=-=-=-=-=-=-=-")
    print("Documents: ", format_space(len(corpus)))
    print("Tokens:    ", format_space(sum(len(d) for d in corpus)))
    # set().union(*corpus) also works when the corpus is empty, whereas the
    # unbound set.union(*[]) raises TypeError because it gets no arguments.
    print("Vocabulary:", format_space(len(set().union(*corpus))))
    print("")
def display_stats_corpuses(corpuses):
    """Print the statistics block of every corpus, in the order given."""
    for single_corpus in corpuses:
        display_stats_corpus(single_corpus)
In [255]:
# Build 10 nested corpora: each one is a prefix of the full document list, largest first.
corpuses = split_documents(n_corpuses=10)
In [256]:
# Print document, token and vocabulary counts for each of the corpora.
display_stats_corpuses(corpuses)
=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  1 701
Tokens:     17 005 208
Vocabulary: 253 855

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  1 005
Tokens:     10 050 000
Vocabulary: 189 702

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  594
Tokens:     5 940 000
Vocabulary: 138 809

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  351
Tokens:     3 510 000
Vocabulary: 104 471

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  207
Tokens:     2 070 000
Vocabulary: 79 693

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  122
Tokens:     1 220 000
Vocabulary: 59 551

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  72
Tokens:     720 000
Vocabulary: 42 215

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  42
Tokens:     420 000
Vocabulary: 30 242

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  25
Tokens:     250 000
Vocabulary: 22 309

=-=-=-=-=-=-=-=-=-=-=-=-
Documents:  15
Tokens:     150 000
Vocabulary: 15 624

In [232]:
# Total number of tokens in each corpus; scores_dataframe uses this list as the "tokens" index.
sizes = [sum(map(len, corpus)) for corpus in corpuses]

Training embeddings

In [233]:
# Hyper-parameters read as globals by train_embeddings_generic and forwarded to
# both model constructors as `size`, `window` and `min_count`.
size = 300
window = 5
# One min_count per corpus, log-spaced from 10**2 down to 10**0 and truncated to int:
# the largest corpus (first) gets 100, the smallest (last) gets 1.
min_count = np.logspace(2, 0, num=len(corpuses), dtype=int)
In [234]:
def train_embeddings_generic(corpuses, name, method):
    """Train one embedding model per corpus and report the time spent.

    Parameters
    ----------
    corpuses : sequence
        Corpora to train on; corpus ``i`` is trained with ``min_count[i]``.
    name : str
        Label used in the progress messages. When it equals ``"svd"`` the
        extra keyword ``sub_threshold=1e-3`` is passed to ``method``.
    method : callable
        Model constructor, called as ``method(documents, **options)``.

    Returns
    -------
    list
        The trained models, in the same order as ``corpuses``.

    Notes
    -----
    Reads the notebook-level globals ``size``, ``window`` and ``min_count``.
    """
    n_total = len(corpuses)
    trained = []
    t_begin = time.time()
    for position, documents in enumerate(corpuses):
        print("Building", name, "embeddings", position + 1, "out of", n_total, "... ", end="", flush=True)
        t_model = time.time()
        options = dict(size=size, window=window, min_count=min_count[position], workers=16)
        if name == "svd":
            # Only the svd constructor receives the sub-sampling threshold.
            options["sub_threshold"] = 1e-3
        trained.append(method(documents, **options))
        print(time.time() - t_model, "s")
    print("Total time:", time.time() - t_begin, "s")
    return trained

def train_embeddings_svd(corpuses):
    """Train one svd2vec model per corpus via train_embeddings_generic."""
    return train_embeddings_generic(corpuses, name="svd", method=svd2vec)

def train_embeddings_w2v(corpuses):
    """Train one gensim Word2Vec model per corpus via train_embeddings_generic."""
    return train_embeddings_generic(corpuses, name="w2v", method=Word2Vec)
In [235]:
# True: retrain every model and save it to disk. False: load previously saved models instead.
BUILD_MODEL_SCRATCH = True
In [236]:
def save(models, name, ext):
    """Save each model to ``<name><k>.<ext>``, where k counts from 1 in list order."""
    for number, model in enumerate(models, start=1):
        target = name + str(number) + "." + ext
        model.save(target)
def load(indexes, module, name, ext):
    """Load models through ``module.load`` from ``<name><i + 1>.<ext>`` for each i in ``indexes``.

    The indexes are zero-based; 1 is added so the file names match those written by ``save``.
    """
    loaded = []
    for i in indexes:
        source = name + str(i + 1) + "." + ext
        loaded.append(module.load(source))
    return loaded

# Either train both model families on every corpus and persist them, or reload
# the files written by a previous run (one file per corpus, numbered from 1).
if BUILD_MODEL_SCRATCH:
    svd_models = train_embeddings_svd(corpuses)
    save(svd_models, "svd_model_", "svd")
    
    w2v_models = train_embeddings_w2v(corpuses)
    save(w2v_models, "w2v_model_", "w2v")
else:
    # load() adds 1 to each index, so range(len(corpuses)) maps to files 1..len(corpuses).
    svd_models = load(range(len(corpuses)), svd2vec, "svd_model_", "svd")
    w2v_models = load(range(len(corpuses)), Word2Vec, "w2v_model_", "w2v")
Building svd embeddings 1 out of 10 ... 706.2786853313446 s
Building svd embeddings 2 out of 10 ... 501.7367248535156 s
Building svd embeddings 3 out of 10 ... 341.3248236179352 s
Building svd embeddings 4 out of 10 ... 245.39279174804688 s
Building svd embeddings 5 out of 10 ... 183.73224020004272 s
Building svd embeddings 6 out of 10 ... 146.37917041778564 s
Building svd embeddings 7 out of 10 ... 127.93301200866699 s
Building svd embeddings 8 out of 10 ... 147.84809947013855 s
Building svd embeddings 9 out of 10 ... 244.36981296539307 s
Building svd embeddings 10 out of 10 ... 124.36313104629517 s
Total time: 2769.392218351364 s
Building w2v embeddings 1 out of 10 ... 58.6909613609314 s
Building w2v embeddings 2 out of 10 ... 34.75031590461731 s
Building w2v embeddings 3 out of 10 ... 21.14402985572815 s
Building w2v embeddings 4 out of 10 ... 11.848723888397217 s
Building w2v embeddings 5 out of 10 ... 7.3934125900268555 s
Building w2v embeddings 6 out of 10 ... 5.56996488571167 s
Building w2v embeddings 7 out of 10 ... 3.2016634941101074 s
Building w2v embeddings 8 out of 10 ... 2.2402801513671875 s
Building w2v embeddings 9 out of 10 ... 1.778069257736206 s
Building w2v embeddings 10 out of 10 ... 1.9220893383026123 s
Total time: 148.577397108078 s

Evaluation

In [238]:
# Word-pair similarity files, resolved through FilesIO.path; each one is later
# passed to the models' evaluate_word_pairs method.
files = [
    FilesIO.path('similarities/wordsim353.txt'),
    FilesIO.path('similarities/men_dataset.txt'),
    FilesIO.path('similarities/mturk.txt'),
    FilesIO.path('similarities/simlex999.txt'),
    FilesIO.path('similarities/rarewords.txt')
]
In [237]:
from scipy import stats
# Monkey-patch: from here on scipy.stats.spearmanr is svd2vec's confidence_pearson,
# for every caller in this kernel session.
# NOTE(review): presumably gensim's evaluate_word_pairs calls stats.spearmanr internally,
# which would explain why models_get_score unpacks its second return value as
# (score, p_value, low, high) - confirm against the installed gensim version.
stats.spearmanr = Utils.confidence_pearson

def models_get_score(models, file):
    """Evaluate every model on one word-pair file and collect score and confidence bounds.

    Models exposing a ``wv`` attribute are evaluated through
    ``model.wv.evaluate_word_pairs(file)``, whose second return value is unpacked
    as ``(score, p_value, low, high)``. Other models are evaluated through
    ``model.evaluate_word_pairs(file)``, unpacked as ``(score, p_value, (low, high))``.

    If the evaluation raises ``ValueError`` the model gets a score of 0 and a
    confidence of (0, 0), as before, but a warning now reports which model
    failed and why, so the zero is not mistaken for a measured score.

    Parameters
    ----------
    models : iterable
        Models to evaluate.
    file : str
        Path of the word-pair similarity file.

    Returns
    -------
    scores : list
        One score per model.
    confidences : numpy.ndarray
        Array of shape (2, n_models): row 0 is ``score - low``, row 1 is
        ``high - score``.
    """
    import warnings

    scores = []
    confidences = []
    for index, model in enumerate(models):
        try:
            if hasattr(model, 'wv'):
                _, (score, _, low, high), _ = model.wv.evaluate_word_pairs(file)
            else:
                score, _, (low, high) = model.evaluate_word_pairs(file)
            # Distances from the score to each bound.
            confidence = (score - low, high - score)
        except ValueError as error:
            warnings.warn(
                "evaluate_word_pairs failed for model #{} on {!r}: {}; "
                "using score 0".format(index, file, error),
                stacklevel=2,
            )
            score = 0
            confidence = (0, 0)
        scores.append(score)
        confidences.append(confidence)
    return scores, np.array(confidences).T

def scores_dataframe(all_models, all_models_names, file):
    """Build the score table of several model families for one word-pair file.

    Parameters
    ----------
    all_models : sequence
        One list of models per family; each list is scored with models_get_score.
    all_models_names : sequence of str
        Column name for each family, aligned with ``all_models``.
    file : str
        Path of the word-pair similarity file.

    Returns
    -------
    frame : pandas.DataFrame
        One column of scores per family, indexed by "tokens".
    confidences : numpy.ndarray
        The confidence arrays returned by models_get_score, stacked in family order.

    Notes
    -----
    Reads the notebook-level global ``sizes`` to fill the "tokens" index.
    """
    frame = pd.DataFrame()
    family_confidences = []
    for position, family in enumerate(all_models):
        family_scores, family_confidence = models_get_score(family, file)
        frame[all_models_names[position]] = family_scores
        family_confidences.append(family_confidence)
    frame["tokens"] = sizes
    frame = frame.set_index("tokens")
    return frame, np.array(family_confidences)

def scores_dataframes(all_models, all_models_names, files):
    """Return one ``scores_dataframe`` result per file, in the order of ``files``."""
    per_file = []
    for file in files:
        per_file.append(scores_dataframe(all_models, all_models_names, file))
    return per_file
In [239]:
# One (score table, confidences) pair per similarity file, with columns "svd" and "w2v".
scores = scores_dataframes([svd_models, w2v_models], ["svd", "w2v"], files)
In [249]:
def plot_dataframes(dfs, names):
    """Plot each score table with error bars and a shaded confidence band per model family.

    Parameters
    ----------
    dfs : sequence of (DataFrame, array) pairs
        Each table must have "svd" and "w2v" columns. In the array, entry 0
        belongs to "svd" and entry 1 to "w2v"; within an entry, item 0 is the
        distance below the score and item 1 the distance above it.
    names : sequence of str
        Title of each figure, aligned with ``dfs``.
    """
    # (column, band colour) listed in the same order as the entries of `errors`.
    series_styles = (("svd", '#1f77b4'), ("w2v", '#ff7f0e'))
    for position, (df, errors) in enumerate(dfs):
        ax = df.plot(style='-', yerr=errors)

        # Compute every band before drawing any of them.
        bands = [
            (df[column] - errors[k][0], errors[k][1] + df[column], colour)
            for k, (column, colour) in enumerate(series_styles)
        ]
        for lower, upper, colour in bands:
            ax.fill_between(df.index, lower, upper, alpha=.05, color=colour)

        ax.set_ylim(0,1)
        ax.set_title(names[position])
        ax.set_xlabel("learning corpus tokens")
        ax.set_ylabel("pearson similarity coefficient")
        ax.grid(color='gray', linestyle='-', linewidth=0.5)
In [252]:
# Draw one figure per similarity file, titled with the file's base name.
plot_dataframes(scores, [os.path.basename(f) for f in files])