Source code for fee.reports.global_report

import numpy as np
import pandas as pd
from ..utils import get_g
import seaborn as sns
import matplotlib.pyplot as plt

class GlobalReport:
    """`GlobalReport` Class

    Ranks the words of an embedding by the absolute value of their
    projection onto a direction `g` and reports the extremes.
    """

    def __init__(self, E, g=None):
        """Generate a global bias report for a word embedding.

        This report computes the least and most biased words in an
        embedding and plots them. Bias by projection (direct bias) is
        used as the metric to compute this report. The report also plots
        the overall distribution of bias in the embedding `E`.

        Args:
            E (WE class object): Word embeddings object. This class reads
                `E.vecs`, `E.words` and `E.dim`.
            g (np.array): gender direction. When `None`, it is obtained
                from `get_g(E)`.

        Raises:
            ValueError: if the length of `g` does not equal `E.dim`.
        """
        if g is None:
            g = get_g(E)
        # Explicit check instead of `assert`: assertions are removed when
        # Python runs with -O, which would let a mismatched direction through.
        if len(g) != E.dim:
            raise ValueError(
                "g has length {} but the embedding dimension is {}".format(
                    len(g), E.dim
                )
            )
        self.g = g
        self.E = E

    def plot(self, values):
        """Plot the distribution of bias values.

        Args:
            values (list): list of bias by projection
        """
        # NOTE(review): `distplot` is reported as deprecated in recent
        # seaborn releases -- confirm against the pinned seaborn version
        # before upgrading (`histplot`/`displot` are the usual successors).
        sns.distplot(values)
        plt.title("Distribution plot of bias by projection for all words.")
        plt.show()

    def get_values_and_words(self):
        """Get the list of words in `E` sorted by bias by projection.

        The bias of a word is the absolute value of the dot product of
        its vector with `g`. Results are ordered from most to least biased.

        Returns:
            tuple: `(sorted_words, sorted_values)` where `sorted_words` is
            a list of words and `sorted_values` is the array of their
            bias values, both in descending order of bias.
        """
        dbs = np.abs(self.E.vecs.dot(self.g))
        # One argsort is enough: indexing with it yields the sorted values,
        # so the words and values stay aligned without sorting twice.
        indices = np.argsort(dbs)[::-1]
        sorted_values = dbs[indices]
        sorted_words = [self.E.words[i] for i in indices]
        return sorted_words, sorted_values

    def print_df(self, sorted_values, sorted_words, n):
        """Pretty print the dataframes of most and least biased words in `E`.

        Args:
            sorted_values (list): list of bias by projection for
                `sorted_words`
            sorted_words (list): list of words, most biased first
            n (int): no. of least/most biased words to print. Values
                larger than the number of words are capped to it.

        Returns:
            tuple: `(most_gendered_df, least_gendered_df)`, each with the
            columns `'words'` and `'bias by projection'`.

        Raises:
            ValueError: if `n` is negative.
        """
        if n < 0:
            raise ValueError("n must be non-negative, got {}".format(n))

        total = len(sorted_words)
        n = min(n, total)
        # Slice from an explicit start index: `seq[-n:]` with n == 0 is
        # `seq[0:]`, which would return every word instead of none.
        tail_start = total - n

        most_gendered_df = pd.DataFrame({
            'words': sorted_words[:n],
            'bias by projection': sorted_values[:n],
        })
        least_gendered_df = pd.DataFrame({
            'words': sorted_words[tail_start:],
            'bias by projection': sorted_values[tail_start:],
        })
        print(most_gendered_df, "\n\n", least_gendered_df)
        return most_gendered_df, least_gendered_df

    def generate(self, n=10, ret_df=False, plot=True):
        """Generate the global report for embedding `E`

        Args:
            n (int): No. of most/least biased words to print.
            ret_df (bool): When true, return the two dataframes.
            plot (bool): When true, plot the distribution of bias values
                for all words.

        Returns:
            tuple or None: `(most_gendered_df, least_gendered_df)` when
            `ret_df` is true, otherwise `None`.
        """
        sorted_words, sorted_values = self.get_values_and_words()
        most_gendered_df, least_gendered_df = self.print_df(
            sorted_values, sorted_words, n
        )
        if plot:
            self.plot(sorted_values)
        if ret_df:
            return most_gendered_df, least_gendered_df
        return None