Distributed representation for text


Importing libraries and packages

# Mathematical operations and data manipulation
import numpy as np

# Modelling
import gensim.downloader as api
from gensim.models import Word2Vec

# NLP
import nltk
from nltk.corpus import brown, movie_reviews

Training embeddings

# Download and load the text8 corpus via Gensim's downloader API
dataset = api.load("text8")
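
The loaded corpus behaves like an iterable of tokenised documents, which is exactly what Word2Vec expects as input. A quick, purely illustrative peek at the first few tokens of the first document (a minimal sketch; the exact output depends on the downloaded text8 data):

# Peek at the first document in the corpus: a list of lowercase tokens
first_doc = next(iter(dataset))
print(first_doc[:10])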

The word2vec algorithm is essentially a prediction exercise, a massive game of “fill in the blanks”: the model repeatedly predicts a word from its surrounding context (or the context from the word), and the weights learned along the way become the word vectors.

# Fix NumPy's random seed to 1 (note: a fully reproducible Word2Vec run
# also needs the model's own seed parameter and a single worker thread)
np.random.seed(1)
model = Word2Vec(dataset)
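
Because training is framed as prediction, the trained model can also “fill in the blank” directly: predict_output_word() returns the most probable centre words for a list of context words (available here because the default model is trained with negative sampling). A minimal sketch with purely illustrative context words:

# Predict likely centre words given some context words
print(model.predict_output_word(["cat", "dog", "pet"], topn=5))
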
# Accessing the word vector/embedding for the term "animal"
print(model.wv["animal"])
len(model.wv["animal"])
[ 0.27228504 -0.38589182 -0.03982047  0.57194465  2.1317153   0.6118779
 -3.2943935  -1.279106   -1.480803   -0.4851454  -0.123493   -0.18274239
  1.635121    0.83649874 -0.9406745   0.6846406   1.7386141  -0.7655637
 -0.37932056 -0.3007117  -0.7148655  -1.2088175  -0.80008346 -3.13427
 -1.2619188  -3.2389088   3.3998997   0.48120168 -0.26664948 -0.80954057
  2.4452078   0.5406759   1.5318955   0.9631443   0.5058787  -1.4876282
  1.4796237  -1.5362738   0.29777908 -0.84316385 -0.3179715   0.36696172
 -0.6838597   1.5695717  -0.6970046   0.5419471   0.7158093   0.27194694
 -0.05068693  0.11243282  1.2082376   0.88686913 -0.96424097  0.7430806
  1.2636031  -0.45338708  0.89310837 -0.6195426  -2.5058024   0.621189
 -0.39454693 -0.44573182 -1.1688169   2.518913   -0.328193   -0.6137894
  0.6997081  -3.5487535  -0.65153015  1.3728606  -0.85431874  0.8286054
  0.18094862  1.1130623  -0.177708   -0.5921164   0.03566591  1.7110436
 -0.7840261   1.1224031   0.63851506 -0.4155715  -1.538735    1.8196214
 -1.2503967  -0.26837078  1.7721883  -1.5477698   2.1892536  -0.56743556
  0.5118777  -0.3046281  -0.5253251   1.1636406   1.1029975  -0.35229835
 -0.45145327  0.14134814 -2.8448613   0.7859125 ]
100
1model.wv.most_similar("animal")
[('insect', 0.748289167881012),
 ('animals', 0.7470211982727051),
 ('ants', 0.6867718696594238),
 ('aquatic', 0.6736039519309998),
 ('organism', 0.6627269983291626),
 ('feces', 0.6588298082351685),
 ('insects', 0.6584202647209167),
 ('humans', 0.653847336769104),
 ('mammal', 0.647551953792572),
 ('eating', 0.647230327129364)]
1model.wv.most_similar("happiness")
[('pleasure', 0.7531014680862427),
 ('humanity', 0.7518173456192017),
 ('goodness', 0.7395626306533813),
 ('dignity', 0.7233573794364929),
 ('perfection', 0.7214764356613159),
 ('desires', 0.7147167325019836),
 ('righteousness', 0.7142145037651062),
 ('compassion', 0.7134000062942505),
 ('satisfaction', 0.7051723003387451),
 ('conscious', 0.6937511563301086)]

Semantic regularities in word embeddings

The most_similar() method also accepts positive and negative lists of words: the vectors of the positive words are added and those of the negative words subtracted before the nearest neighbours are retrieved. This makes analogy-style queries possible; a sketch of the underlying vector arithmetic follows the examples below.

1model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=5)
[('queen', 0.6923314332962036),
 ('empress', 0.6271005868911743),
 ('elizabeth', 0.6254151463508606),
 ('princess', 0.6178668141365051),
 ('prince', 0.615206241607666)]
1model.wv.most_similar(positive=["uncle", "woman"], negative=["man"], topn=5)
[('aunt', 0.798119306564331),
 ('wife', 0.7972561717033386),
 ('grandmother', 0.7968131899833679),
 ('niece', 0.7801458239555359),
 ('husband', 0.7780385613441467)]
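
Roughly the same analogies can be computed by hand: subtract and add the raw vectors, then look up the words nearest to the resulting vector. A small sketch (most_similar() normalises vectors internally, so the exact ranking may differ slightly):

# king - man + woman, then find the words closest to the resulting vector
vec = model.wv["king"] - model.wv["man"] + model.wv["woman"]
print(model.wv.similar_by_vector(vec, topn=5))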

Vectors for phrases

# Extracting vectors for the terms "get" and "happy"
v1 = model.wv["get"]
v2 = model.wv["happy"]
# Creating a vector as the element-wise average of the two
# vectors, (v1 + v2)/2. This is the vector for the phrase "get happy"
res1 = (v1 + v2) / 2
# Creating a vector for the phrase "make merry"
v1 = model.wv["make"]
v2 = model.wv["merry"]
res2 = (v1 + v2) / 2
# Checking the cosine similarity between the two
model.wv.cosine_similarities(res1, [res2])
array([0.5615818], dtype=float32)
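
cosine_similarities() computes the cosine of the angle between the vectors, that is, the dot product divided by the product of the vector norms. The value above can be checked by hand with NumPy:

# Cosine similarity computed directly: dot(res1, res2) / (|res1| * |res2|)
print(np.dot(res1, res2) / (np.linalg.norm(res1) * np.linalg.norm(res2)))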

Effect of parameters

Skip-gram vs. CBOW

Choosing between Skip-gram and CBOW as the learning algorithm is done by setting sg=1 for Skip-gram (the default is sg=0, that is, CBOW). Skip-gram predicts the context words from the central target word; CBOW flips this formulation and uses the context words to predict the target word. Try both and pick the one that gives the best results; a sketch of a more systematic comparison follows the examples below.

# Default learning algorithm: CBOW (sg=0)
model = Word2Vec(dataset)
1model.wv.most_similar("oeuvre", topn=5)
[('orchestration', 0.737889289855957),
 ('bastien', 0.7042948007583618),
 ('rebuttals', 0.6919044852256775),
 ('missa', 0.6900247931480408),
 ('notebooks', 0.6863371729850769)]
# Skip-gram: sg=1
model_sg = Word2Vec(dataset, sg=1)
model_sg.wv.most_similar("oeuvre", topn=5)
[('masterful', 0.836944043636322),
 ('orchestration', 0.8146712779998779),
 ('inklings', 0.812639057636261),
 ('irreverent', 0.8059399127960205),
 ('nutcracker', 0.803440272808075)]
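
To choose between the two variants more systematically, both models can be scored on a word-analogy benchmark. A hedged sketch using the questions-words.txt file shipped with Gensim's test data (assuming it is present in your installation; any analogy file in the same format would work):

from gensim.test.utils import datapath

# Score each model on the standard word-analogy questions;
# higher accuracy suggests better vectors for analogy-style tasks
analogies = datapath("questions-words.txt")
cbow_score, _ = model.wv.evaluate_word_analogies(analogies)
sg_score, _ = model_sg.wv.evaluate_word_analogies(analogies)
print(f"CBOW: {cbow_score:.3f}  Skip-gram: {sg_score:.3f}")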

Training word vectors on different datasets

1nltk.download("brown")
2nltk.download("movie_reviews")
True
model_brown = Word2Vec(brown.sents(), sg=1)
model_movie = Word2Vec(movie_reviews.sents(), sg=1)
model_brown.wv.most_similar("money", topn=5)
[('job', 0.8253815770149231),
 ('care', 0.8236523270606995),
 ('friendship', 0.8147565722465515),
 ('risk', 0.806850254535675),
 ('joy', 0.8031473159790039)]
model_movie.wv.most_similar("money", topn=5)
[('cash', 0.7720625400543213),
 ('risk', 0.6932722926139832),
 ('record', 0.6870973706245422),
 ('ransom', 0.6857244968414307),
 ('bucks', 0.6649163961410522)]

Bias in embeddings

1model.wv.most_similar(positive=["woman", "doctor"], negative=["man"], topn=5)
[('nurse', 0.6089574694633484),
 ('child', 0.5759322643280029),
 ('teacher', 0.5489358305931091),
 ('helen', 0.5343905687332153),
 ('psychiatrist', 0.5290003418922424)]
1model.wv.most_similar(positive=["woman", "smart"], negative=["man"], topn=5)
[('fancy', 0.561569094657898),
 ('dumb', 0.5561914443969727),
 ('jukebox', 0.5503122806549072),
 ('casual', 0.5307846069335938),
 ('pong', 0.5303421020507812)]

There is clearly bias in the resulting word vectors, but consider where it comes from: the underlying data uses ‘nurse’ in contexts involving women where ‘doctor’ appears in contexts involving men. It is therefore the underlying text that contains the bias, not the algorithm itself.