# Source code for fee.reports.global_report
import numpy as np
import pandas as pd
from ..utils import get_g
import seaborn as sns
import matplotlib.pyplot as plt
class GlobalReport:
    """Global bias report for a word embedding.

    The report ranks every word of an embedding by its *bias by
    projection* (direct bias): the absolute value of the dot product
    between the word's vector and the gender direction `g`. It can print
    the most and least biased words and plot the overall distribution of
    the bias values.
    """

    def __init__(self, E, g=None):
        """Store the embedding and the gender direction used by the report.

        Args:
            E (WE class object): Word embeddings object. This class reads
                its `vecs`, `words` and `dim` attributes.
            g (np.array): gender direction. When `None`, it is obtained
                by calling `get_g(E)`.

        Raises:
            AssertionError: if the length of `g` differs from `E.dim`.
        """
        if g is None:
            g = get_g(E)
        assert len(g) == E.dim, (
            "gender direction has length {} but the embedding has "
            "dimension {}".format(len(g), E.dim)
        )
        self.g = g
        self.E = E

    def plot(self, values):
        """Plot the distribution of the bias values and show the figure.

        Args:
            values (list): list of bias by projection
        """
        # `distplot` is deprecated in recent seaborn releases; `histplot`
        # with these arguments is the documented equivalent. Fall back to
        # `distplot` only on seaborn versions that predate `histplot`.
        histplot = getattr(sns, "histplot", None)
        if histplot is not None:
            histplot(values, kde=True, stat="density", kde_kws=dict(cut=3))
        else:
            sns.distplot(values)
        plt.title("Distribution plot of bias by projection for all words.")
        plt.show()

    def get_values_and_words(self):
        """Get the words in `E` sorted by decreasing bias by projection.

        Returns:
            tuple: `(sorted_words, sorted_values)` where `sorted_words`
            is a list of words ordered from most to least biased and
            `sorted_values` is the array of the matching bias values.
        """
        dbs = np.abs(self.E.vecs.dot(self.g))
        # A single argsort yields both the ordering of the words and, by
        # indexing, the sorted values; no second sort is needed.
        order = np.argsort(dbs)[::-1]
        sorted_values = dbs[order]
        sorted_words = [self.E.words[i] for i in order]
        return sorted_words, sorted_values

    def print_df(self, sorted_values, sorted_words, n):
        """Pretty print the dataframes containing the most and least
        biased words in `E`.

        Args:
            sorted_values (list): list of bias by projection for
                `sorted_words`
            sorted_words (list): list of words, ordered from most to
                least biased
            n (int): no. of least/most biased words to print

        Returns:
            tuple: `(most_gendered_df, least_gendered_df)`, two
            dataframes with the columns 'words' and
            'bias by projection'.

        Raises:
            ValueError: if `n` is negative.
        """
        if n < 0:
            raise ValueError("n must be non-negative, got {}".format(n))

        total = len(sorted_words)
        count = min(n, total)
        # Slice the tail from an explicit start index: `seq[-0:]` is the
        # whole sequence, so `[-n:]` would return every word when n == 0.
        tail_start = total - count

        most_gendered_df = pd.DataFrame({
            'words': sorted_words[:count],
            'bias by projection': sorted_values[:count],
        })
        least_gendered_df = pd.DataFrame({
            'words': sorted_words[tail_start:],
            'bias by projection': sorted_values[tail_start:],
        })
        print(most_gendered_df, "\n\n", least_gendered_df)
        return most_gendered_df, least_gendered_df

    def generate(self, n=10, ret_df=False, plot=True):
        """Generate the global report for embedding `E`.

        Args:
            n (int): No. of most/least biased words to print.
            ret_df (bool): when True, return the two dataframes.
            plot (bool): when True, plot the distribution of the bias
                values of all words.

        Returns:
            tuple or None: `(most_gendered_df, least_gendered_df)` when
            `ret_df` is True, otherwise None.
        """
        sorted_words, sorted_values = self.get_values_and_words()
        most_gendered_df, least_gendered_df = self.print_df(
            sorted_values, sorted_words, n
        )
        if plot:
            self.plot(sorted_values)
        if ret_df:
            return most_gendered_df, least_gendered_df
        return None