Source code for fee.embedding.loader

import zipfile
import os
# from tqdl import download
from tqdm import tqdm
import re
import gc
import numpy as np
import codecs
import gensim.downloader as api
from gensim.test.utils import get_tmpfile

         
[docs]class WE(): """The Word embedding class. The main class that facilitates the word embedding structure. Attributes ---------- dim (int): Dimension of embedding vecs (np.array): """ def __init__(self): """ Initialize WE object. """ # self.downloader = Downloader() self.desc = "Word embedding loader for "
[docs] def fname_to_format(self, fname): """Get embedding format from file name. Format can usually be extracted from the filename extension. We currently support the loading of embeddings in binary (.bin), text (.txt) and numpy format (.npy). Args: fname (str): file name Return: format (str): format (txt, bin or npy) """ if fname is None: raise "fname can't be None" return None if fname.endswith('.txt'): format = 'txt' elif fname.endswith('.bin'): format = 'bin' else: format = 'npy' return format
[docs] def get_gensim_word_vecs(self, model): """ Loading word and vecs using gensim scripts. Args: model (gensim object): Model for accessing all the words in vocab, and their vectors. """ words = sorted([w for w in model.vocab], key=lambda w: model.vocab[w].index) vecs = np.array([model[w] for w in words]) return words, vecs
def _load(self, fname, format, dim = 300): """Internal load function. There shall be no exceptions in this function. Verify everything beforehand. Loads word embedding at location `fname` on disk. Args: fname (str): Path to the embedding file on disk. format (str): Format of word embedding. Following are the supported formats: - binary - text - numpy array dim (int): The dimension of embedding vectors. Return: words (list): List of vocabulary words. vecs (np.array): Word vectors of size (self.n, self.dim) """ vecs = [] words = [] if format is None: format = self.fname_to_format(fname) if format == 'bin': import gensim.models model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True) words, vecs = self.get_gensim_word_vecs(model) elif format == 'txt': with open(fname, "r") as f: lines = f.readlines() for line in lines: tokens = line.split() v = np.array([float(x) for x in tokens[-dim:]]) w = "_".join([str(x) for x in tokens[:-dim]]) if len(v) != dim: print(f"Weird line: {tokens} | {len(v)}") continue words.append(w) vecs.append(v) else: with codecs.open(fname + '.vocab', 'r', 'utf-8') as f_embed: words = [line.strip() for line in f_embed] vecs = np.load(fname + '.wv.npy') self.n, self.dim = vecs.shape self.desc = f"File: {fname}\tFormat: {format}\t" \ f"#Words: {self.n}\tDimension: {self.dim}" return words, vecs
[docs] def load(self, fname=None, format=None, ename=None, normalize=False, dim = 300): """Load word embedding from filename or embedding name. Loads word embeddings from either filename `fname` or the embedding name `ename`. Following formats are supported: - bin: Binary format, load through gensim. - txt: Text w2v or GloVe format. - npy: Numpy format. `fname.wv.npy` contans the numpy vector while `fname.vocab` contains the vocabulary list. All Gensim pre-trained embeddings are integrated for easy access via `ename`. `ename` are same as the gensim conventions. Example: ``` we = WE() E = we.load('glove6B.txt', dim = 300) ``` ``` we = WE() E = we.load(ename = 'glove-wiki-gigaword-50') ``` Args: fname (str): Path to the embedding file on disk. format (str): Format of word embedding. Following are the supported formats: - binary - text - numpy array ename (str): Name of embedding. This will download embedding using the `Downloader` class. In case both ename and fname are provided, ename is given priority. normalize (bool): Normalize word vectors or not. dim (int): The dimension of embedding vectors. Default dimension is 300 Return: self (WE object): Return self, the word embedding object. """ if ename is not None: model = api.load(ename) words, vecs = self.get_gensim_word_vecs(model) else: words, vecs = self._load(fname, format, dim) self.words = words self.vecs = vecs self.reindex() self.normalized = normalize if normalize: self.normalize() return self
[docs] def reindex(self): """Reindex word vectors. """ self.index = {w: i for i, w in enumerate(self.words)} self.n, self.dim = self.vecs.shape assert self.n == len(self.words) == len(self.index)
[docs] def v(self, word): """Access vector for a word Returns the `self.dim` dimensional vector for the word `word`. Example: E = WE().load('glove') test = E.v('test') Args: word (str): Word to access vector of. Return: vec (np.array): `self.dim` dimension vector for `word`. """ vec = self.vecs[self.index[word]] return vec
[docs] def normalize(self): """Normalize word embeddings. Normaliation is done as follows: \vec{v}_{norm} := \vec{v}/|\vec{v}| where |\vec{v}| is the L2 norm of \vec{v} """ self.vecs /= np.linalg.norm(self.vecs, axis=1)[:, np.newaxis] self.reindex() self.desc += "\tNormalized: True" self.normalized = True
def __repr__(self): """Class `__repr__` object for pretty informational print. """ return self.desc
# if __name__ == "__main__": # E = WE().load(ename = "glove-wiki-gigaword-100", normalize=True) # print(E.v('dog')) # print(E)