Distributed representation for text


Importing libraries and packages

# Mathematical operations and data manipulation
import numpy as np

# Modelling
import gensim.downloader as api
from gensim.models import Word2Vec

# NLP
import nltk
from nltk.corpus import brown, movie_reviews

Training embeddings

# Download and load the text8 corpus via Gensim's downloader API
dataset = api.load("text8")
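
The loaded corpus behaves like an iterable of tokenised documents, which is exactly what Word2Vec expects as input. A quick, purely illustrative peek at the first few tokens of the first document (a minimal sketch; the exact output depends on the downloaded text8 data):

# Peek at the first document in the corpus: a list of lowercase tokens
first_doc = next(iter(dataset))
print(first_doc[:10])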

The word2vec algorithm is essentially a prediction exercise, a massive game of “fill in the blanks”: the model repeatedly predicts a word from its surrounding context (or the context from the word), and the weights learned along the way become the word vectors.

# Fix NumPy's random seed to 1 (note: a fully reproducible Word2Vec run
# also needs the model's own seed parameter and a single worker thread)
np.random.seed(1)
model = Word2Vec(dataset)
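
Because training is framed as prediction, the trained model can also “fill in the blank” directly: predict_output_word() returns the most probable centre words for a list of context words (available here because the default model is trained with negative sampling). A minimal sketch with purely illustrative context words:

# Predict likely centre words given some context words
print(model.predict_output_word(["cat", "dog", "pet"], topn=5))
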
# Accessing the word vector/embedding for the term "animal"
print(model.wv["animal"])
len(model.wv["animal"])
[ 0.27228504 -0.38589182 -0.03982047  0.57194465  2.1317153   0.6118779
 -3.2943935  -1.279106   -1.480803   -0.4851454  -0.123493   -0.18274239
  1.635121    0.83649874 -0.9406745   0.6846406   1.7386141  -0.7655637
 -0.37932056 -0.3007117  -0.7148655  -1.2088175  -0.80008346 -3.13427
 -1.2619188  -3.2389088   3.3998997   0.48120168 -0.26664948 -0.80954057
  2.4452078   0.5406759   1.5318955   0.9631443   0.5058787  -1.4876282
  1.4796237  -1.5362738   0.29777908 -0.84316385 -0.3179715   0.36696172
 -0.6838597   1.5695717  -0.6970046   0.5419471   0.7158093   0.27194694
 -0.05068693  0.11243282  1.2082376   0.88686913 -0.96424097  0.7430806
  1.2636031  -0.45338708  0.89310837 -0.6195426  -2.5058024   0.621189
 -0.39454693 -0.44573182 -1.1688169   2.518913   -0.328193   -0.6137894
  0.6997081  -3.5487535  -0.65153015  1.3728606  -0.85431874  0.8286054
  0.18094862  1.1130623  -0.177708   -0.5921164   0.03566591  1.7110436
 -0.7840261   1.1224031   0.63851506 -0.4155715  -1.538735    1.8196214
 -1.2503967  -0.26837078  1.7721883  -1.5477698   2.1892536  -0.56743556
  0.5118777  -0.3046281  -0.5253251   1.1636406   1.1029975  -0.35229835
 -0.45145327  0.14134814 -2.8448613   0.7859125 ]
100
1model.wv.most_similar("animal")
[('insect', 0.748289167881012),
 ('animals', 0.7470211982727051),
 ('ants', 0.6867718696594238),
 ('aquatic', 0.6736039519309998),
 ('organism', 0.6627269983291626),
 ('feces', 0.6588298082351685),
 ('insects', 0.6584202647209167),
 ('humans', 0.653847336769104),
 ('mammal', 0.647551953792572),
 ('eating', 0.647230327129364)]
1model.wv.most_similar("happiness")
[('pleasure', 0.7531014680862427),
 ('humanity', 0.7518173456192017),
 ('goodness', 0.7395626306533813),
 ('dignity', 0.7233573794364929),
 ('perfection', 0.7214764356613159),
 ('desires', 0.7147167325019836),
 ('righteousness', 0.7142145037651062),
 ('compassion', 0.7134000062942505),
 ('satisfaction', 0.7051723003387451),
 ('conscious', 0.6937511563301086)]

Semantic regularities in word embeddings

The most_similar() method also accepts positive and negative lists of words: the vectors of the positive words are added and those of the negative words subtracted before the nearest neighbours are retrieved. This makes analogy-style queries possible; a sketch of the underlying vector arithmetic follows the examples below.

1model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=5)
[('queen', 0.6923314332962036),
 ('empress', 0.6271005868911743),
 ('elizabeth', 0.6254151463508606),
 ('princess', 0.6178668141365051),
 ('prince', 0.615206241607666)]
1model.wv.most_similar(positive=["uncle", "woman"], negative=["man"], topn=5)
[('aunt', 0.798119306564331),
 ('wife', 0.7972561717033386),
 ('grandmother', 0.7968131899833679),
 ('niece', 0.7801458239555359),
 ('husband', 0.7780385613441467)]
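
Roughly the same analogies can be computed by hand: subtract and add the raw vectors, then look up the words nearest to the resulting vector. A small sketch (most_similar() normalises vectors internally, so the exact ranking may differ slightly):

# king - man + woman, then find the words closest to the resulting vector
vec = model.wv["king"] - model.wv["man"] + model.wv["woman"]
print(model.wv.similar_by_vector(vec, topn=5))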

Vectors for phrases

# Extracting vectors for the terms "get" and "happy"
v1 = model.wv["get"]
v2 = model.wv["happy"]
# Creating a vector as the element-wise average of the two
# vectors, (v1 + v2)/2. This is the vector for the phrase "get happy"
res1 = (v1 + v2) / 2
# Creating a vector for the phrase "make merry"
v1 = model.wv["make"]
v2 = model.wv["merry"]
res2 = (v1 + v2) / 2
# Checking the cosine similarity between the two
model.wv.cosine_similarities(res1, [res2])
array([0.5615818], dtype=float32)
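
cosine_similarities() computes the cosine of the angle between the vectors, that is, the dot product divided by the product of the vector norms. The value above can be checked by hand with NumPy:

# Cosine similarity computed directly: dot(res1, res2) / (|res1| * |res2|)
print(np.dot(res1, res2) / (np.linalg.norm(res1) * np.linalg.norm(res2)))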

Effect of parameters

Skip-gram vs. CBOW

Choosing between Skip-gram and CBOW as the learning algorithm is done by setting sg=1 for Skip-gram (the default is sg=0, that is, CBOW). Skip-gram predicts the context words from the central target word; CBOW flips this formulation and uses the context words to predict the target word. Try both and pick the one that gives the best results; a sketch of a more systematic comparison follows the examples below.

# Default learning algorithm: CBOW (sg=0)
model = Word2Vec(dataset)
1model.wv.most_similar("oeuvre", topn=5)
[('orchestration', 0.737889289855957),
 ('bastien', 0.7042948007583618),
 ('rebuttals', 0.6919044852256775),
 ('missa', 0.6900247931480408),
 ('notebooks', 0.6863371729850769)]
# Skip-gram: sg=1
model_sg = Word2Vec(dataset, sg=1)
model_sg.wv.most_similar("oeuvre", topn=5)
[('masterful', 0.836944043636322),
 ('orchestration', 0.8146712779998779),
 ('inklings', 0.812639057636261),
 ('irreverent', 0.8059399127960205),
 ('nutcracker', 0.803440272808075)]
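
To choose between the two variants more systematically, both models can be scored on a word-analogy benchmark. A hedged sketch using the questions-words.txt file shipped with Gensim's test data (assuming it is present in your installation; any analogy file in the same format would work):

from gensim.test.utils import datapath

# Score each model on the standard word-analogy questions;
# higher accuracy suggests better vectors for analogy-style tasks
analogies = datapath("questions-words.txt")
cbow_score, _ = model.wv.evaluate_word_analogies(analogies)
sg_score, _ = model_sg.wv.evaluate_word_analogies(analogies)
print(f"CBOW: {cbow_score:.3f}  Skip-gram: {sg_score:.3f}")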

Training word vectors on different datasets

1nltk.download("brown")
2nltk.download("movie_reviews")
True
model_brown = Word2Vec(brown.sents(), sg=1)
model_movie = Word2Vec(movie_reviews.sents(), sg=1)
model_brown.wv.most_similar("money", topn=5)
[('job', 0.8253815770149231),
 ('care', 0.8236523270606995),
 ('friendship', 0.8147565722465515),
 ('risk', 0.806850254535675),
 ('joy', 0.8031473159790039)]
model_movie.wv.most_similar("money", topn=5)
[('cash', 0.7720625400543213),
 ('risk', 0.6932722926139832),
 ('record', 0.6870973706245422),
 ('ransom', 0.6857244968414307),
 ('bucks', 0.6649163961410522)]

Bias in embeddings

1model.wv.most_similar(positive=["woman", "doctor"], negative=["man"], topn=5)
[('nurse', 0.6089574694633484),
 ('child', 0.5759322643280029),
 ('teacher', 0.5489358305931091),
 ('helen', 0.5343905687332153),
 ('psychiatrist', 0.5290003418922424)]
1model.wv.most_similar(positive=["woman", "smart"], negative=["man"], topn=5)
[('fancy', 0.561569094657898),
 ('dumb', 0.5561914443969727),
 ('jukebox', 0.5503122806549072),
 ('casual', 0.5307846069335938),
 ('pong', 0.5303421020507812)]

There is clearly bias in the resulting word vectors, but consider where it comes from: the underlying data uses ‘nurse’ in contexts involving women where ‘doctor’ appears in contexts involving men. It is therefore the underlying text that contains the bias, not the algorithm itself.