Place an X in the appropriate bracket below to specify if you would like your group's project to be made available to the public. (Note that student names will be included, but PIDs will be scrubbed from any groups who include their PIDs.)
In responding to the global pandemic caused by COVID-19, most countries have implemented different strategies to protect their people's health. Their attitude in responding to this pandemic may partially represent each government's general attitude toward its people. Vaccination efforts are a common and leading protection strategy adopted by most countries. In this project, we make use of vaccination/death data by country and explore the relationship between the set of {nation-wide onset date of vaccination, average new vaccination/death rate across different time spans} and various metrics of the happiness score in 2021. In particular, we try to figure out to what extent our independent variables, namely all COVID-19 related data, are correlated with these metrics of the happiness score (i.e. social support, healthy life expectancy, perceptions of corruption, and generosity). Through our careful analysis, we concluded that while some of the metrics, such as perceptions of corruption and generosity, don't significantly correlate with the COVID-19 data, there does exist a significant correlation for social support and healthy life expectancy against our COVID-19 data representation. Further, in terms of the COVID-19 data themselves, a higher average new vaccination rate doesn't necessarily bring a decreasing trend in the average new death rate globally, and an early onset of vaccination shows a negative correlation with death rate in developed countries, and ironically a positive correlation with death rate in developing countries.
How are the onset of vaccination, the vaccination rate, and the death rate in a country related to the happiness score of that country?
In response to the rapid spread of COVID-19, many countries have implemented social distancing, mandatory quarantine, and reopening guidance measures to reduce the rate of transmission. As a result, many people's lifestyles have changed greatly due to the restrictive rules enforced by governments. They were forced to work remotely from home, limited to certain activities, and lived in fear of the virus. The skyrocketing infection and death rates took a toll on people not only physically but also mentally. In addition, the economic recession resulting from the COVID-19 pandemic negatively impacted people's mental health. People experience feelings of stress, frustration, anger, anxiety, or loneliness. According to the recent research report "The Implications of COVID-19 for Mental Health and Substance Use" (reference 1), 4 in 10 adults in the U.S. have reported symptoms of anxiety or depressive disorder.
As such, everyone in the world is waiting for a vaccine to defeat the COVID-19 pandemic, hoping it will bring life back to normalcy. Hence, we are curious to investigate the relationship between three factors (the onset of vaccination, the increasing rate of vaccination, and the death rate) and the different metrics of the happiness score (social support, healthy life expectancy, generosity, and perceptions of corruption) of each country. We want to discover how the degree of importance a country attaches to vaccines, and the speed at which they are popularized, affect the COVID-19 death rate and thereby impact the happiness level of the people in that country.
There have been studies on the relationship between the COVID-19 pandemic and the different metrics of a country's happiness score. The World Happiness Report conducted research on "Happiness, trust, and deaths under COVID-19", which compares the overall life evaluations and measures of positive and negative emotions of different countries in 2020 with the data for 2017-2019, before COVID-19 (reference 2). The research reported life evaluation scores in each country based on its average income, life expectancy, and four social factors. The researchers compared the rankings of happiness (average life evaluation) in different countries based on the 2020 survey to those in 2017-2019, and the study found that COVID-19 has led to only a modest change in the overall rankings (reference 2). There is an insignificant difference in the overall average happiness score, from 5.81 in 2017-2019 to 5.85 in 2020 (reference 3).
Even though the study revealed that there is no significant difference in the overall happiness score before and after the onset of COVID-19, it didn't investigate how the happiness score fluctuates with the onset of vaccination, the increasing injection rate of the vaccination, and the COVID-19 death rate. The goal of our project is to use data with a detailed time frame to investigate how the happiness score changes over time along with these three crucial factors. By analyzing and visualizing the data across time spans, we hope to determine whether there are clear relationships between these three variables and the happiness score, and then further investigate how they are related.
References (include links):
Hypothesis:
An earlier onset and a faster increasing rate of vaccination, which possibly correlate with a lower death rate over time, result in a potentially higher happiness score.
Defense:
1) It intuitively makes sense that the earlier the onset of COVID-19 vaccination, the earlier the general public of a country is protected from the virus, which contributes to a decreasing death rate.
2) It intuitively makes sense that when the death rate of a country is low, people would experience less stress from the threats of death, which implicitly increases the happiness score of that country.
We plan to join the death dataset with the vaccination progress and population datasets. Then we calculate the death and vaccination rates over certain time intervals to summarize this time-series data for each country. Finally, we join the summarized data with the World Happiness Report to perform EDA and analyze possible correlations.
First, let's configure the IPython notebook display and import (install) all dependencies required to run this notebook. If some packages are missing, pip will automatically install them. After the pip install is done, restart the kernel, rerun the cells in the setup, and you are ready to go! The setup will take longer to finish the first time you run it due to package installation and dataset download. The datasets will be cached locally, so you won't need to download them again after the first time.
%%javascript
IPython.OutputArea.auto_scroll_threshold = 99999;
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}
import warnings
# Initialization configurations for better visualization effects
%matplotlib notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))
try:
    # Data Cleaning / EDA Suite
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import matplotlib.animation as animation
    import matplotlib.style as style
    import seaborn as sns
    import geopandas as gpd
    import altair as alt
    import plotly.express as px
    from celluloid import Camera
    # Tools for building the dataset locally
    import os
    import glob
    import json
    import shutil
    import opendatasets as od
    # Tools for statistical analysis
    from sklearn.decomposition import PCA
    from sklearn.metrics import mean_squared_error, r2_score, normalized_mutual_info_score
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import OneHotEncoder, RobustScaler
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.mixture import GaussianMixture
    from sklearn.svm import SVR
    from sklearn.manifold import TSNE
    import patsy
    import statsmodels.api as sm
except:
    warnings.warn('Missing packages detected. pip is attempting to install all missing dependencies. After this is finished, please rerun this cell', UserWarning)
    !pip install -r requirements.txt
# disable warnings for better visualization
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 8)
The below script will get you all the datasets needed to run this notebook. Make sure you have your kaggle.json ready (see README.md for more details).
# set force_reset = True if you want to rebuild the dataset
def create_dataset(force_reset=False):
    # all the data will be stored locally under the data directory
    data_path = 'data'
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    # make sure we have 3 files in the data folder
    if len(glob.glob('data/*.csv')) != 3 or force_reset:
        try:
            shutil.rmtree('./owid-covid-data')
            shutil.rmtree('./world-happiness-report-2021')
            shutil.rmtree('./population-by-country-2020')
        except:
            pass
        try:
            kaggle = json.load(open('kaggle.json'))
        except:
            print("Make sure your kaggle.json is correctly configured")
        # download all of the datasets
        od.download('https://covid.ourworldindata.org/data/owid-covid-data.csv')
        od.download('https://www.kaggle.com/ajaypalsinghlo/world-happiness-report-2021')
        od.download('https://www.kaggle.com/tanuprabhu/population-by-country-2020')
        # rename the csv files and save them into the data folder
        os.rename('./owid-covid-data.csv',
                  './data/owid-covid-data.csv')
        os.rename('./world-happiness-report-2021/world-happiness-report-2021.csv',
                  './data/world_happiness.csv')
        os.rename('./population-by-country-2020/population_by_country_2020.csv',
                  './data/population.csv')
        shutil.rmtree('./world-happiness-report-2021')
        shutil.rmtree('./population-by-country-2020')
    else:
        # if all datasets exist, skip the data preparation
        print("Use Cached Data")
    return True

# create the dataset
create_dataset()
# read all the datasets from the local data folder and save them as separate Pandas DataFrames
vac_and_death = pd.read_csv('data/owid-covid-data.csv')
happiness = pd.read_csv('data/world_happiness.csv')
population = pd.read_csv('data/population.csv')
Use Cached Data
For the first dataset, we decided to extract the information related to how people are vaccinated and the death counts in each country. total_vaccinations, people_vaccinated, people_fully_vaccinated, new_vaccinations, and new_vaccinations_smoothed describe the vaccinated population in each country, while total_deaths and new_deaths describe the cumulative deaths and the daily increase in deaths. For the second dataset, Country name is kept for merging the datasets. We select Social support, Healthy life expectancy, Freedom to make life choices, Generosity, and Perceptions of corruption because these are reasonable metrics of happiness within these countries. For the third dataset, Country (or dependency) is kept for merging. We keep Population (2020), Density (P/Km²), and Med. Age as a description of the overall population in a particular country. We started with the happiness dataset and merged it with the population dataset, because our target variable is happiness. Using the merged result, we then further merged it with the vaccination dataset.
# select the desired columns for further analysis for each DataFrame
vac_and_death_cols = ['location', 'date', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', \
'new_vaccinations', 'new_vaccinations_smoothed','total_deaths', 'new_deaths']
happiness_cols = ['Country name', 'Regional indicator', 'Social support',
'Healthy life expectancy', 'Freedom to make life choices', 'Generosity',
'Perceptions of corruption']
population_cols = ['Country (or dependency)', 'Population (2020)', 'Density (P/Km²)', 'Med. Age']
# keep the desired columns in the DataFrame.
vac_and_death = vac_and_death[vac_and_death_cols]
happiness = happiness[happiness_cols]
population = population[population_cols]
# merge the happiness and population by each country and name it df.
df = happiness.merge(population, left_on='Country name', right_on='Country (or dependency)')
# for the vac_and_death dataset, we only keep the rows whose countries appear in df.
country_set = set(df['Country name'].values)
vac_and_death = vac_and_death[vac_and_death.location.isin(country_set)]
Since our research question is "How are the onset of vaccination, the vaccination rates, and the death rate in a country related to the happiness score of that country?", we need to determine the onset dates for each country before doing further analysis.
As a result, we have decided to use the first date on which each country has a non-NaN value of new_vaccinations_smoothed, given that this column has the fewest missing values. To make this happen, we selected only the columns date, location, and new_vaccinations_smoothed, saved them to a temporary DataFrame, and grouped it by location, because we are interested in the onset date for each country.
After observing the dataset, we found that if a country has not had any person vaccinated yet, the csv file uses NaN for the missing values. Thus, the first date with a non-NaN value is the vaccination onset date. We applied a lambda function to drop all the NaN values and find the date with the minimum value (the earliest date, which is the onset date).
We saved those dates into a dictionary whose key-value pairs are the countries and their corresponding onset dates.
vac_and_death_temp = vac_and_death[['date', 'location', 'new_vaccinations_smoothed']]
onset_date = vac_and_death_temp.groupby('location')[['date', 'new_vaccinations_smoothed']]\
.apply(lambda df: df.dropna().date.min())
onset_date = onset_date.to_dict()
# dictionary with the key-value pairs of countries and their corresponding onset dates
# for a cleaner view of this notebook, we only display the first five country: date pairs
[f'{country}: {onset_date}' for country, onset_date in onset_date.items()][:5]
['Afghanistan: 2021-02-23', 'Albania: 2021-01-11', 'Algeria: 2021-01-30', 'Argentina: 2020-12-30', 'Armenia: 2021-04-01']
In order to calculate the mean vaccination rate across {7, 15, 30, 60, 90, 180, and 360} days, we first find the onset date of vaccination in each country and then calculate the average number of vaccinations administered since that onset date.
Therefore, we initially set the onset date in each country to the first date on which new_vaccinations (daily vaccinations) data is available.
However, we noticed that there are exceptions. For instance, Denmark had only 2 vaccinations in roughly the first 15 days and then over 1000 daily vaccinations in the following days. We can see that in such cases the nominal onset date is not the actual date the vaccinations were given to the public. Therefore, it is necessary to make a more inclusive definition of the onset date so that we can better deal with these exceptions.
As a result, we calculated the day-over-day ratio of new vaccinations for each day. If the number of new vaccinations is over 100 times that of the previous date, then we treat this country as an exception that possibly has some anomaly in its new_vaccinations.
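In other words (using illustrative symbols rather than the column names themselves), the check implemented in the cell below flags a row on day $d$ whenever $$r_d = \frac{v_d}{v_{d-1}}, \qquad 100 < r_d < \infty,$$ where $v_d$ denotes new_vaccinations_smoothed on day $d$ and the upper bound excludes ratios that are infinite because the previous day's value was zero.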
def check(ratio):
    # flag ratios above 100; make sure the ratio is bounded by infinity
    if ratio > 100 and ratio < float('inf'):
        return True
    return False

# Apply the function to the ratio of new_vaccinations_smoothed between neighbouring rows to determine
# which days satisfy the condition 100 < ratio < infinity, where 100 is an arbitrary threshold that
# signifies a substantial increase.
if_ratio_greater_than_100 = (vac_and_death['new_vaccinations_smoothed']
                             / vac_and_death['new_vaccinations_smoothed'].shift(1)).apply(check)
vac_and_death['ratio_greater_than_100_p_v'] = if_ratio_greater_than_100
vac_and_death_true = vac_and_death[vac_and_death['ratio_greater_than_100_p_v'] == True]
vac_and_death_true
 | location | date | total_vaccinations | people_vaccinated | ... | new_vaccinations_smoothed | total_deaths | new_deaths | ratio_greater_than_100_p_v
---|---|---|---|---|---|---|---|---|---|
31506 | Denmark | 2020-12-27 | 6017.0 | 6017.0 | ... | 859.0 | 1174.0 | 21.0 | True |
45268 | Ghana | 2021-05-25 | NaN | NaN | ... | 7801.0 | 783.0 | 0.0 | True |
59429 | Jamaica | 2021-06-05 | NaN | NaN | ... | 336.0 | 964.0 | 4.0 | True |
113812 | Switzerland | 2020-12-23 | 429.0 | 405.0 | ... | 214.0 | 7203.0 | 93.0 | True |
125326 | Uruguay | 2021-03-01 | 18406.0 | 18406.0 | ... | 9017.0 | 611.0 | 3.0 | True |
5 rows × 10 columns
After taking a closer look at the above countries in the original data table, we found that Jamaica has a huge number of vaccinations on its onset date because it only has a few records. As a result, we see it as an outlier that could be discarded from our analysis. Thus, we removed Jamaica from our onset_date dictionary.
On the other hand, we reset the onset date for those countries that showed a significant gap between two dates of new vaccinations (an increase of over 100 times). We reset the onset date to ensure that it represents when the vaccinations were actually rolled out to the public.
# pop out the country of Jamaica
onset_date.pop('Jamaica', None)
# reset the onset date for Denmark, Switzerland, Uruguay
onset_date['Denmark'] = '2020-12-27'
onset_date['Switzerland'] = '2020-12-23'
onset_date['Uruguay'] = '2021-03-01'
# merge the DataFrame built from the onset_date dictionary into the main vac_and_death DataFrame by location and index
vac_and_death = vac_and_death.merge(pd.Series(onset_date).to_frame().reset_index(), \
                                    left_on='location', right_on='index')
# we only need the dates here, so we convert them into datetime objects
vac_and_death.date = pd.to_datetime(vac_and_death.date)
vac_and_death[0] = pd.to_datetime(vac_and_death[0])
# after getting the onset date for each country, we can subtract each country's onset date from each date to
# obtain a time delta since the onset date
vac_and_death = vac_and_death.dropna(subset=['new_vaccinations_smoothed'])
vac_and_death['since_onset_by_group'] = (vac_and_death.date - vac_and_death[0]).dt.days
vac_and_death.head()
 | location | date | total_vaccinations | people_vaccinated | ... | ratio_greater_than_100_p_v | index | 0 | since_onset_by_group
---|---|---|---|---|---|---|---|---|---|
365 | Afghanistan | 2021-02-23 | NaN | NaN | ... | False | Afghanistan | 2021-02-23 | 0 |
366 | Afghanistan | 2021-02-24 | NaN | NaN | ... | False | Afghanistan | 2021-02-23 | 1 |
367 | Afghanistan | 2021-02-25 | NaN | NaN | ... | False | Afghanistan | 2021-02-23 | 2 |
368 | Afghanistan | 2021-02-26 | NaN | NaN | ... | False | Afghanistan | 2021-02-23 | 3 |
369 | Afghanistan | 2021-02-27 | NaN | NaN | ... | False | Afghanistan | 2021-02-23 | 4 |
5 rows × 13 columns
To further explore the vaccination increase for each country, we group our vaccination and death data by country, and then for each country we take the average of daily vaccinations and deaths over {7, 15, 30, 60, 90, 180, 360} days starting from the onset day of vaccination.
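In other words (using illustrative symbols), for a country $c$ and a time span $T \in \{7, 15, 30, 60, 90, 180, 360\}$, the cell below computes the raw average $$\bar{v}_{c,T} = \frac{1}{|D_{c,T}|}\sum_{d \in D_{c,T}} v_{c,d},$$ where $v_{c,d}$ is new_vaccinations_smoothed on day $d$ and $D_{c,T}$ is the set of days within $T$ days of country $c$'s onset date (and analogously for new_deaths). A few cells later, these raw averages are divided by the country's 2020 population to obtain the normalized new vaccination/death rates used in the rest of the analysis.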
# create a new DataFrame
vac_death_agg = pd.DataFrame()
# add the country names into the DataFrame
vac_death_agg['location'] = vac_and_death[vac_and_death.since_onset_by_group \
    <= 7].groupby('location')['new_vaccinations_smoothed'].mean().index
# define a list of time intervals
time_intervals = [7, 15, 30, 60, 90, 180, 360]
# iterate through all of the time intervals and calculate the average of new vaccinations/deaths for each time interval
for time_interval in time_intervals:
    vac_death_agg[f'new_vac_{time_interval}_days_raw'] = \
        vac_and_death[vac_and_death.since_onset_by_group <= time_interval]\
        .groupby('location')['new_vaccinations_smoothed'].mean().values
    vac_death_agg[f'new_death_{time_interval}_days_raw'] = \
        vac_and_death[vac_and_death.since_onset_by_group <= time_interval]\
        .groupby('location')['new_deaths'].mean().values
# merge vac_death_agg into the main df DataFrame so that we can do further analysis
df = df.merge(vac_death_agg, left_on='Country name', right_on='location')
# normalize the new vaccination/death counts for each country by its population
for time_interval in time_intervals:
    df[f'new_vac_{time_interval}_days'] = \
        df[f'new_vac_{time_interval}_days_raw']/df['Population (2020)']
    df = df.drop(columns=[f'new_vac_{time_interval}_days_raw'])
    df[f'new_death_{time_interval}_days'] = \
        df[f'new_death_{time_interval}_days_raw']/df['Population (2020)']
    df = df.drop(columns=[f'new_death_{time_interval}_days_raw'])
onset_df = pd.Series(onset_date).to_frame().reset_index()
onset_df.columns = ['index', 'onset_date']
df = df.merge(onset_df, left_on='Country name', right_on='index')
df = df.drop(['index'], axis=1)
df.head()
 | Country name | Regional indicator | Social support | Healthy life expectancy | ... | new_death_180_days | new_vac_360_days | new_death_360_days | onset_date
---|---|---|---|---|---|---|---|---|---|
0 | Finland | Western Europe | 0.954 | 72.0 | ... | 4.067209e-07 | 0.004752 | 3.649659e-07 | 2021-01-01 |
1 | Denmark | Western Europe | 0.954 | 72.7 | ... | 1.412142e-06 | 0.004531 | 9.599740e-07 | 2020-12-27 |
2 | Switzerland | Western Europe | 0.942 | 74.4 | ... | 2.380255e-06 | 0.004052 | 1.550371e-06 | 2020-12-23 |
3 | Iceland | Western Europe | 0.983 | 73.0 | ... | 1.617216e-08 | 0.005384 | 4.736506e-08 | 2020-12-31 |
4 | Netherlands | Western Europe | 0.942 | 72.4 | ... | 1.916123e-06 | 0.004669 | 1.321419e-06 | 2021-01-07 |
5 rows × 27 columns
*Note 1: Many visualizations are animated to demonstrate how patterns change across various time intervals, and some can be interacted with. For the best visualization and interaction effects, it is strongly recommended to run the notebook in an IPython environment. Alternatively, you may find *.gif files inside the visualizations directory, which are saved versions of the animated visualizations demonstrated in this notebook.
*Note 2: Jupyter Notebook doesn't work well with matplotlib in notebook mode. In particular, the notebook may compress some output visualization cells into a scrollable format, which negatively impacts the viewing experience. We've tried to programmatically change this behavior, but it still happens sometimes. When it occurs, select the cell, then in the menu click the following sequence: Cell -> Current Output -> Toggle Scrolling
Most countries that are available in the geopandas package are taken into account in our model. The countries that are excluded from our model are indicated by the light blue color on the map; most of them are developing third-world countries in Africa.
The possible reasons might be that these countries don't have the resources to collect data, that the data aren't publicly available, or that they simply don't have a sufficient population base. Because these countries are mostly developing ones, they might have a higher death rate and a lower vaccination rate. Within the scope of our analysis, we need to keep in mind the potential biases and assumptions we make here. When proceeding with the analysis and the discussion, we'll also make sure that our analysis doesn't discriminate against those countries.
country_name_to_replace = {
'United States': 'United States of America',
'Bosnia and Herzegovina': 'Bosnia and Herz.',
'Dominican Republic': 'Dominican Rep.',
'North Macedonia': 'Macedonia',
}
# Countries that cannot be shown with geopandas world's map:
# Bahrain, Malta, Singapore, Mauritius, Maldives, Comoros
new_country_lst = []
for c in df['Country name'].values:
if c in country_name_to_replace:
c = country_name_to_replace[c]
new_country_lst.append(c)
df['Country name'] = new_country_lst
style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [9.8, 6]
# Plot the countries we will analyze versus the countries we won't
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))[['name', 'geometry']]
avail_countries = world.merge(df[['Country name']], left_on='name', right_on='Country name', how='left')
avail_countries['avail'] = ~avail_countries['Country name'].isna().values
avail_countries.avail = avail_countries.avail.replace({False: 'Not Analyzed', True: 'Analyzed'})
avail_countries.plot(column='avail', legend=True, categorical=True,);
plt.title('Countries in the Scope of Our Analysis');
*The above figure is static when the code cell is properly run.
By plotting the overall distributions of the various happiness metrics (social support, healthy life expectancy, perceptions of corruption, and generosity), we can get a better idea of what these distributions look like. In our graph, we observe a strongly left-skewed distribution for both Social Support and Perceptions of Corruption, a left-skewed distribution for Healthy Life Expectancy, and a roughly normal distribution for Generosity (centered around 0). People in most countries are satisfied with their social support and have a life expectancy above 65, but they tend to believe that their countries have a high degree of corruption, while generosity is roughly neutral.
plt.rcParams['figure.figsize'] = [9.8, 6]
plt.rcParams.update({'font.size': 10})
f, axes = plt.subplots(2, 2)
def plot_stationary_hist(ax, var, label, bins=20):
sns.histplot(data=df, x=var, ax=ax, bins=bins)
ax.set_xlabel(label)
# Plot the distribution of the four happiness metrics we will use
plot_stationary_hist(axes[0][0], 'Social support', 'Social Support')
plot_stationary_hist(axes[0][1], 'Healthy life expectancy', 'Healthy Life Expectancy')
plot_stationary_hist(axes[1][0], 'Perceptions of corruption','Perceptions of Corruption')
plot_stationary_hist(axes[1][1], 'Generosity','Generosity')
f.suptitle('Overall Distribution of Various Happiness Metrics');
*The above figure is static when the code cell is properly run.
In the figure below, the top-left graph displays the overall distribution of average new vaccination rates across different time spans (i.e. 7, 15, 30, 60, 90, 180, 360 days). These distributions are heavily skewed, with most countries clustered at low values and an exponentially decreasing shape. Since most countries started from a small or zero vaccinated population, vaccination rates are clustered at low values over the first seven days. The top-right graph visualizes the average new death rate across different time spans. We also observe a low death rate at the beginning, followed by an increasing trend in the death rate over the next several time spans. To make the distributions more symmetric, we take the logarithm of both the average new vaccination rate and the death rate. We can clearly see that both are transformed into roughly normal distributions.
Thus, we are able to perceive the changes in the new vaccination and death rates more clearly. By observing how the distributions change across different time spans (i.e. 7, 15, 30, 60, 90, 180, 360 days), we can see that the average new vaccination rate and the average new death rate gradually shift from left to right. The log-scaled new vaccination rate loses its symmetry and gradually shifts toward higher vaccination rates as the majority of countries start to implement vaccination on a large scale. The log-scaled new death rate also clearly displays an increase in the death rate. Despite the increasing vaccination rate over time, a potential reason for the increasing death rate is the wide spread and transmission speed of the disease.
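Concretely, the log scaling applied in the cell below is $$x' = \log(x + \varepsilon), \qquad \varepsilon = 10^{-5},$$ where the small constant $\varepsilon$ keeps countries with a zero rate finite on the log scale.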
# https://hihayk.github.io/scale/#4/6/50/80/-51/67/20/14/1D9A6C/29/154/108/white
colors = [
'#0A2F51',
'#0E4D64',
'#137177',
'#188977',
'#1D9A6C',
'#39A96B',
'#56B870',
'#74C67A',
]
colors.reverse()
plt.rcParams['figure.figsize'] = [9.8, 6]
plt.rcParams.update({'font.size': 8})
# to visualize -inf at log scale
eps = 1e-5
f, axes = plt.subplots(2, 2)
title = 'Overall Distribution in Average New Vaccination Rate and Death Rate Across Different Time Spans'
f.suptitle(title,
size='medium')
camera = Camera(f)
def plot_animation_hist(ax, var=None, label=None, color=None, bins=20, manual_entry=False, data=None):
    if not manual_entry:
        sns.histplot(x=var, data=df, ax=ax, color=color, bins=bins)
    else:
        sns.histplot(x=data, ax=ax, color=color, bins=bins)
    ax.text(0, -0.15, label, transform=ax.transAxes)
    ax.set_xlabel('')

# Plot the average new vaccination rate and death rate across different time spans:
# 7, 15, 30, 60, 90, 180, 360 days
vac_to_plot = [f'new_vac_{number}_days' for number in time_intervals]
death_to_plot = [f'new_death_{number}_days' for number in time_intervals]
for i in range(len(vac_to_plot)):
    plot_animation_hist(axes[0][0], vac_to_plot[i], color=colors[i],
                        label=f'Average New Vaccination Rate over {time_intervals[i]} Days')
    plot_animation_hist(axes[0][1], death_to_plot[i], color=colors[i],
                        label=f'Average New Death Rate over {time_intervals[i]} Days')
    plot_animation_hist(axes[1][0], color=colors[i], manual_entry=True, data=np.log(df[vac_to_plot[i]] + eps),
                        label=f'Average New Vaccination Rate over {time_intervals[i]} Days (Log-Scaled)')
    # use the death-rate columns here so the plotted data matches the label
    plot_animation_hist(axes[1][1], color=colors[i], manual_entry=True, data=np.log(df[death_to_plot[i]] + eps),
                        label=f'Average New Death Rate over {time_intervals[i]} Days (Log-Scaled)')
    camera.snap()
animation = camera.animate(interval = 1500, repeat = True, repeat_delay = 0)
if not os.path.exists('visualizations'): os.makedirs('visualizations')
animation.save(f'visualizations/{title}.gif')
animation;
MovieWriter ffmpeg unavailable; using Pillow instead.
*The above figure is animated when the code cell is properly run.
In the figure below, we want to see the trend of the vaccination onset date of each country against the corresponding happiness score metrics. Since we consider 4 metrics of the happiness score, there are 4 corresponding scatter plots with least-squares regression lines, one per metric. If we look closer at each graph, we observe a slight positive slope of perceptions of corruption and of generosity against the number of days since the earliest onset, and a moderate negative slope of social support and of healthy life expectancy against the number of days since the earliest onset. This roughly shows that the earlier a country gets vaccinated, the higher its happiness score, and this holds across all four happiness score metrics.
Moreover, for each scatter plot with a least-squares regression line we have a corresponding scatter plot in which different colors represent the geography of each country, so that we can get a better sense of the order of onset dates for each cluster of countries. In these plots, we can see that countries from Europe, North America, and East Asia have the earliest onset dates of COVID-19 vaccination. This makes sense because the United States (North America), the United Kingdom (Europe), and China (East Asia) were among the earliest countries to start developing vaccines. We can also see that the developed countries in Europe, which were also among the earliest to start vaccinating, have low perceptions of corruption scores. As a result, we can use our prior knowledge to intuitively check that our data and conclusions make sense.
plt.rcParams['figure.figsize'] = [9.8, 10]
plt.rcParams.update({'font.size': 7})
# Extract the columns that we want to analyze
onset_happiness = df[['Country name', 'Regional indicator', 'Perceptions of corruption',
                      'Social support', 'Healthy life expectancy', 'Generosity', 'onset_date']]
# Calculate how many days after the earliest onset date among all countries
# each country started its vaccination
onset_happiness['days_since_earliest'] = (pd.to_datetime(df['onset_date']) \
    - pd.to_datetime(df['onset_date']).min()).dt.days
# Make two separate sets of scatter plots. Both plot the happiness metrics versus
# the number of days since the earliest onset for all countries. While plots in the first
# set have a linear regression line, plots in the second set are hued by different regions
f, axes = plt.subplots(4, 2)
f.suptitle('Relationship Between Promptitude of Vaccination Onset versus Various Happiness Score Metrics',
           size='medium')
columns_to_plot = ['Perceptions of corruption', 'Social support', 'Healthy life expectancy', 'Generosity']
for i in range(len(axes)):
    sns.regplot(x='days_since_earliest', y=columns_to_plot[i],
                data=onset_happiness, ax=axes[i][0])
    axes[i][0].set_xlabel('Number of Days since the Earliest Onset among all Countries')
    axes[i][0].set_ylabel(columns_to_plot[i].title())
    sns.scatterplot(x='days_since_earliest', y=columns_to_plot[i],
                    data=onset_happiness, hue='Regional indicator', ax=axes[i][1])
    axes[i][1].set_xlabel('Number of Days since the Earliest Onset among all Countries')
    axes[i][1].set_ylabel(columns_to_plot[i].title())
    axes[0][1].legend(bbox_to_anchor=(1, 1), loc='lower right', ncol=2)
    if i != 0: axes[i][1].get_legend().remove()
*The above figure is static when the code cell is properly run.
def trend_animation(kw, label, title=None):
    def configure_subplot(ax, x, y):
        ax.text(0.2, -0.15, x, transform=ax.transAxes)
        ax.set_xlabel('')
        ax.set_ylabel(y)
    plt.rcParams['figure.figsize'] = [9.8, 6]
    plt.rcParams.update({'font.size': 8})
    # set the timespans for plots
    iterations_to_plot = [f'new_{kw}_{number}_days' for number in time_intervals]
    # Extract the happiness metrics that we want to analyze
    columns_to_plot = ['Perceptions of corruption', 'Social support', 'Healthy life expectancy', 'Generosity']
    vac_happiness = df[iterations_to_plot + columns_to_plot]
    for num_days in iterations_to_plot:
        vac_happiness[num_days] = np.log(vac_happiness[num_days])
    # Plot all four metrics versus the selected column across all timespans
    f, axes = plt.subplots(2, 2)
    if title is not None: f.suptitle(title, size='medium')
    camera = Camera(f)
    for i in range(len(iterations_to_plot)):
        sns.regplot(x=iterations_to_plot[i], y=columns_to_plot[0],
                    data=vac_happiness, ax=axes[0][0], color=colors[i])
        configure_subplot(axes[0][0],
                          f'Average New {label} Rate of {time_intervals[i]} Days',
                          'Perceptions of Corruption')
        sns.regplot(x=iterations_to_plot[i], y=columns_to_plot[1],
                    data=vac_happiness, ax=axes[0][1], color=colors[i])
        configure_subplot(axes[0][1],
                          f'Average New {label} Rate of {time_intervals[i]} Days',
                          'Social Support')
        sns.regplot(x=iterations_to_plot[i], y=columns_to_plot[2],
                    data=vac_happiness, ax=axes[1][0], color=colors[i])
        configure_subplot(axes[1][0],
                          f'Average New {label} Rate of {time_intervals[i]} Days',
                          'Healthy Life Expectancy')
        sns.regplot(x=iterations_to_plot[i], y=columns_to_plot[3],
                    data=vac_happiness, ax=axes[1][1], color=colors[i])
        configure_subplot(axes[1][1],
                          f'Average New {label} Rate of {time_intervals[i]} Days',
                          'Generosity')
        camera.snap()
    animation = camera.animate(interval=1500, repeat=True,
                               repeat_delay=0)
    if not os.path.exists('visualizations'): os.makedirs('visualizations')
    animation.save(f'visualizations/{title}.gif')
    return animation
We want to further investigate the correlation between the happiness metrics (perceptions of corruption, social support, healthy life expectancy, and generosity) and the average new vaccination rate over time. There is no clear linear pattern between the four happiness metrics and the average new vaccination rate in the 7-day time span, but we gradually observe a linear pattern in the following time-span measurements. Similar to the linear pattern in the graph above, we observe a positive slope of perceptions of corruption and of generosity against the average new vaccination rate, and a negative slope of social support and of healthy life expectancy against the average new vaccination rate. Both the positive and negative correlations get stronger as the time span for the average new vaccination rate measurement increases. This intuitively makes some sense: as the time span increases, our new vaccination rate data better represents a country's vaccination condition in the long term. (There could be a huge number of new vaccinations in the first few days for countries with a small vaccinated population; this sharp increase is averaged out over a longer time span.) Thus, the relationship becomes clearer.
# Plot all four metrics versus log-scaled new vaccination rate across all timespans
trend_animation('vac', 'Vaccination',
'Trend Between Various Happiness Metrics versus Log-Scaled Average New Vaccination Rate over Different Time Intervals')
MovieWriter ffmpeg unavailable; using Pillow instead.
<matplotlib.animation.ArtistAnimation at 0x7fbe40a523d0>
*The above figure is animated when the code cell is properly run.
We next want to investigate the correlation between the happiness metrics (perceptions of corruption, social support, healthy life expectancy, and generosity) and the average new death rate over time. There is no clear linear pattern between the four happiness metrics and the average new death rate across all time spans (7, 15, 30, 60, 90, 180, 360 days). Although we can't draw a clear regression line in all distributions across the time spans, it is still possible to observe a slight linear trend. We observe a slight positive slope of perceptions of corruption, social support, and healthy life expectancy against the average new death rate, and a slight negative slope of generosity against the new death rate. We find that a higher death rate is associated with high healthy life expectancy and social support. This is because the disease was most widespread in North America and Western Europe in the early days of COVID-19, places with relatively high healthy life expectancy and social support scores. Therefore, the massive death toll caused by COVID-19 in North America and Western Europe associates a higher overall death rate with high healthy life expectancy and social support scores.
# Plot all four metrics versus log-scaled new death rate across all time spans
trend_animation('death', 'Death',
'Trend Between Various Happiness Metrics versus Log-Scaled Average New Death Rate over Different Time Intervals')
MovieWriter ffmpeg unavailable; using Pillow instead.
<matplotlib.animation.ArtistAnimation at 0x7fbe305163a0>
*The above figure is animated when the code cell is properly run.
The graph below displays the relationship between the average new vaccination rate and the death rate at different time intervals. It is clear that the new vaccination rate has been increasing across the time intervals, since countries gradually implemented vaccination on a larger scale. Although the vaccination rate shows an increasing trend, the death rate experiences no visible changes.
# Make a dataframe with all the stats we want to plot across different time spans
def make_vd_stats(df, time_intervals, column_kw, category_name, melt=True,
                  include_region=False, region_kw='Country name'):
    columns = [region_kw] if include_region else []
    columns += [f'new_{column_kw}_{number}_days' for number in time_intervals]
    data = df[columns]
    renamed_columns = [f'{number} Days since Onset' for number in time_intervals]
    data.columns = ['Region'] + renamed_columns if include_region else renamed_columns
    if melt:
        data = data.melt()
        data['Type'] = category_name
    return data

vac_data = make_vd_stats(df, time_intervals, 'vac', 'Vaccination Rate')
death_data = make_vd_stats(df, time_intervals, 'death', 'Death Rate')
# Concatenate our vaccination data with the death rate data and take the logarithm of
# the new death rate and the new vaccination rate
vac_death_data = pd.concat([vac_data, death_data])
vac_death_data['Log-Scaled Rate'] = np.log(vac_death_data['value'])
# Make two sets of boxplots of the distribution of the log-scaled new death rate and
# new vaccination rate across the indicated time intervals
f, ax = plt.subplots()
sns.boxplot(data=vac_death_data, x="variable", y="Log-Scaled Rate", ax=ax, hue='Type')
ax.set_title('Relationship Between Average New Vaccination Rate and Death Rate at Different Time Intervals')
f.show();
*The above figure contains interaction when the code cell is properly run.
Finally, if we break down the death rate by regional indicator and plot it against the number of days since the onset of vaccination, we find a thought-provoking result: regions with mostly developed countries show a decreasing trend, that is, the longer vaccination had been underway, the lower the average new death rate was; on the other hand, regions with mostly developing countries show an increasing trend in the average new death rate. The reasons for this trend are unknown given the data we have, and possible explanations are discussed in a later part of the project. These diverging results explain why the overall (i.e. global) death rate shows no visible change as we move from 7 days since onset all the way to 360 days since onset: data from different regions balance each other out.
# Make a dataframe with new death rates across all time spans and group by regions
death_data = make_vd_stats(df, time_intervals, 'death', 'Death Rate', melt=False,
include_region=True, region_kw='Regional indicator')
death_data = death_data.melt(id_vars = ['Region'])
death_data['variable'] = death_data['variable'].apply(lambda s: int(s.split()[0]))
death_data.columns = ['Region', 'Number of Days Since Onset', 'Death Rate']
death_data = death_data.groupby(['Region', 'Number of Days Since Onset']).mean().reset_index()
source = death_data
selection = alt.selection_multi(fields=["Region"], bind="legend")
# Plot the trend of new death rates versus time spans by each region
alt.Chart(source).mark_line().encode(
x = alt.X('Number of Days Since Onset', scale=alt.Scale(domain=[7, 360])),
y='Death Rate',
color='Region',
opacity=alt.condition(selection, alt.value(1), alt.value(0.1))
).properties(
height=300, width=650,
title='Trend in Average New Death Rate Across Different Regions Since the Day of Vaccination Onset'
).add_selection(
selection
)
# press legend to select lines, press shift to select multiple lines
*The above figure contains interaction when the code cell is properly run.
Now that we have visually analyzed the relationships between our desired variables, we want to statistically establish some conclusions. Since we observed a similar scatter plot trend between all happiness metrics and the rates over different time spans, we decided to perform a Principal Component Analysis (PCA) on the average new vaccination and death rates to achieve dimensionality reduction (and thus attempt to exclude the effect of multicollinearity on our model). The cumulative explained variance plot below shows that around 95 percent of the variance can be explained by one component of the New Average Death Rate and two components of the New Average Vaccination Rate. Thus, we can reduce the dimension of the New Average Death Rate from 7 to 1 and the dimension of the New Average Vaccination Rate from 7 to 2.
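For reference, the curves plotted below are cumulative explained variance ratios: with the 7 log-scaled rate columns as input, $$\mathrm{CEV}(k) = \frac{\sum_{i=1}^{k}\lambda_i}{\sum_{j=1}^{7}\lambda_j},$$ where $\lambda_1 \ge \dots \ge \lambda_7$ are the variances captured by the successive principal components, so $\mathrm{CEV}(k)$ is the fraction of total variance retained when keeping the first $k$ components.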
# an arbitrarily small constant that prevents some data from causing weird output after applying log,
# given log(0) is undefined
eps=1e-7
# we selected our features to be the new vaccination rates and the new death rates for 7, 15, 30, 60,
# 90, 180, and 360 days.
features = df[['new_vac_7_days', 'new_death_7_days', 'new_vac_15_days',
'new_death_15_days', 'new_vac_30_days', 'new_death_30_days',
'new_vac_60_days', 'new_death_60_days', 'new_vac_90_days',
'new_death_90_days', 'new_vac_180_days', 'new_death_180_days',
'new_vac_360_days', 'new_death_360_days']]
# impute the missing values since PCA cannot handle missing values
features = features.fillna(features.mean())
# normalize the distribution
features = np.log(features + eps)
# promptitude, which is the days since the earliest onset date for vaccination in each country
features['promptitude'] = (pd.to_datetime(df.onset_date) - \
pd.to_datetime(df.onset_date).min()).dt.days
# extract the outcome columns and combine them with the processed features
outcomes_cols = ['Social support', 'Healthy life expectancy', \
'Perceptions of corruption', 'Generosity']
predictor_str = ' + '.join(features.columns)
complete_df = pd.concat([features, df[outcomes_cols]], axis=1)
complete_df.columns = complete_df.columns.str.replace(' ','_')
vac_cols = [f'new_vac_{time}_days' for time in time_intervals]
death_cols = [f'new_death_{time}_days' for time in time_intervals]
# set figure settings for the visualization
plt.rcParams['figure.figsize'] = [9.8, 5]
f, ax = plt.subplots()
# use a PCA to compress the dimensions of the model given the distribution is similar
# PCA plot of New Average Vaccination Rate
pca = PCA().fit(complete_df[vac_cols])
ax.plot([0] + list(np.cumsum(pca.explained_variance_ratio_)), label='New Average Vaccination Rate')
# PCA plot of New Average Death Rate
pca = PCA().fit(complete_df[death_cols])
ax.plot([0] + list(np.cumsum(pca.explained_variance_ratio_)), label='New Average Death Rate')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.legend()
f.suptitle('Cumulative Explained Variance on Average New Vaccination/Death Rate via PCA');
f.show()
By applying this dimensionality reduction to our DataFrame, we are now able to use only four features: VRD1 and VRD2 (the two principal components of the average new vaccination rates), DRD1 (the single principal component of the average new death rates), and PROMPTITUDE (the number of days since the earliest vaccination onset among all countries).
We will then analyze their correlation with the happiness metrics and proceed to the modeling steps. For modeling, we used $\alpha = 0.01$ for significance.
# set new dimension for new average vaccination rate
VRD_NUM = 2
vaccination_rate_features = PCA(VRD_NUM).fit_transform(complete_df[vac_cols])
# set new dimension for new average death rate
DRD_NUM = 1
death_rate_features = PCA(DRD_NUM).fit_transform(complete_df[death_cols])
# select the predictor columns
predictor_cols = [f'VRD{i}' for i in range(1, VRD_NUM+1)] + \
[f'DRD{i}' for i in range(1, DRD_NUM+1)] + \
['PROMPTITUDE']
# select the outcome columns
outcomes_cols = ['Social_support', 'Healthy_life_expectancy', \
'Perceptions_of_corruption', 'Generosity']
# generate a new DataFrame with the new features
features = pd.DataFrame(np.concatenate([vaccination_rate_features,
death_rate_features,
features[['promptitude']].values,
complete_df[outcomes_cols].values], axis=1),
columns=predictor_cols+outcomes_cols)
predictor_str = ' + '.join(predictor_cols)
features.head(5)
 | VRD1 | VRD2 | DRD1 | PROMPTITUDE | Social_support | Healthy_life_expectancy | Perceptions_of_corruption | Generosity
---|---|---|---|---|---|---|---|---|
0 | -1.998782 | 1.066470 | -1.170656 | 29.0 | 0.954 | 72.0 | 0.186 | -0.098 |
1 | -1.892923 | 0.697582 | 2.399335 | 24.0 | 0.954 | 72.7 | 0.179 | 0.030 |
2 | -0.564930 | 1.997444 | 4.358298 | 20.0 | 0.942 | 74.4 | 0.292 | 0.025 |
3 | -2.195527 | 1.309696 | -5.885623 | 28.0 | 0.983 | 73.0 | 0.673 | 0.160 |
4 | -2.935554 | 0.453669 | 3.319475 | 35.0 | 0.942 | 72.4 | 0.338 | 0.175 |
We build an OLS regression model to find the correlation between the independent variables VRD1, VRD2, DRD1, and PROMPTITUDE and the dependent variable Social Support. We see that the R-squared score is moderately high, with $$R^2 = 0.549$$ This means that 54.9% of the variation in Social Support can be explained by these independent variables.
In the summary, we can see that VRD1, VRD2, and DRD1 are all statistically significant, given that their p-values are all less than $$\alpha = 0.01$$ However, PROMPTITUDE is not statistically significant at this level, with a p-value of 0.257. This is possibly because the other factors already explain the outcome well.
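For reference, the R-squared reported in these summaries is the coefficient of determination, $$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},$$ i.e. the fraction of the outcome's variance captured by the fitted model relative to simply predicting the mean.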
# build an OLS model of the four features and social support
outcome, predictors = patsy.dmatrices(
f"{outcomes_cols[0].replace(' ', '_')} ~ {predictor_str}", features)
model = sm.OLS(outcome, predictors)
# print out the summary of the OLS model
results = model.fit()
results.summary()
Dep. Variable: | Social_support | R-squared: | 0.549 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.535 |
Method: | Least Squares | F-statistic: | 40.75 |
Date: | Wed, 08 Dec 2021 | Prob (F-statistic): | 2.61e-22 |
Time: | 20:29:06 | Log-Likelihood: | 157.85 |
No. Observations: | 139 | AIC: | -305.7 |
Df Residuals: | 134 | BIC: | -291.0 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 0.8328 | 0.017 | 49.556 | 0.000 | 0.800 | 0.866 |
VRD1 | -0.0123 | 0.003 | -4.819 | 0.000 | -0.017 | -0.007 |
VRD2 | 0.0282 | 0.006 | 5.122 | 0.000 | 0.017 | 0.039 |
DRD1 | 0.0055 | 0.002 | 2.682 | 0.008 | 0.001 | 0.010 |
PROMPTITUDE | -0.0002 | 0.000 | -1.138 | 0.257 | -0.001 | 0.000 |
Omnibus: | 24.159 | Durbin-Watson: | 1.684 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 33.157 |
Skew: | -0.946 | Prob(JB): | 6.31e-08 |
Kurtosis: | 4.465 | Cond. No. | 224. |
We build an OLS regression model to find the correlation between the independent variables VRD1, VRD2, DRD1, and PROMPTITUDE and the dependent variable Healthy_life_expectancy. We see that the R-squared score is the highest among all the OLS models, with $$R^2 = 0.682$$ This means that 68.2% of the variation in Healthy_life_expectancy can be explained by these independent variables.
In the summary, we can see that VRD1, VRD2, and DRD1 are all statistically significant, given that their p-values are all less than $$\alpha = 0.01$$ However, PROMPTITUDE is not statistically significant at this level, with a p-value of 0.085.
# build an OLS model of the four features and Healthy_life_expectancy
outcome, predictors = patsy.dmatrices(
f"{outcomes_cols[1].replace(' ', '_')} ~ {predictor_str}", features)
model = sm.OLS(outcome, predictors)
# print out the summary of the OLS model
results = model.fit()
results.summary()
Dep. Variable: | Healthy_life_expectancy | R-squared: | 0.682 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.673 |
Method: | Least Squares | F-statistic: | 71.97 |
Date: | Wed, 08 Dec 2021 | Prob (F-statistic): | 1.98e-32 |
Time: | 20:29:06 | Log-Likelihood: | -379.31 |
No. Observations: | 139 | AIC: | 768.6 |
Df Residuals: | 134 | BIC: | 783.3 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 66.3090 | 0.801 | 82.762 | 0.000 | 64.724 | 67.894 |
VRD1 | -0.8273 | 0.121 | -6.815 | 0.000 | -1.067 | -0.587 |
VRD2 | 1.8179 | 0.263 | 6.923 | 0.000 | 1.299 | 2.337 |
DRD1 | 0.2651 | 0.098 | 2.692 | 0.008 | 0.070 | 0.460 |
PROMPTITUDE | -0.0176 | 0.010 | -1.733 | 0.085 | -0.038 | 0.002 |
Omnibus: | 2.502 | Durbin-Watson: | 1.519 |
---|---|---|---|
Prob(Omnibus): | 0.286 | Jarque-Bera (JB): | 2.406 |
Skew: | 0.026 | Prob(JB): | 0.300 |
Kurtosis: | 3.642 | Cond. No. | 224. |
We build an OLS regression model to find the correlation between the independent variables VRD1, VRD2, DRD1, and PROMPTITUDE and the dependent variable Perceptions_of_corruption. We see that the R-squared score is the lowest, with $$R^2 = 0.117$$ This means that only 11.7% of the variation in Perceptions_of_corruption can be explained by these independent variables.
In the summary, we can see that VRD2 and PROMPTITUDE are NOT statistically significant, given that their p-values are greater than $$\alpha = 0.01$$ However, DRD1 displays statistical significance with a p-value of 0.003, and VRD1 falls just under the threshold with a p-value of 0.009.
# build an OLS model of the four features and Perceptions_of_corruption
outcome, predictors = patsy.dmatrices(
f"{outcomes_cols[2].replace(' ', '_')} ~ {predictor_str}", features)
model = sm.OLS(outcome, predictors)
# print out the summary of the OLS model
results = model.fit()
results.summary()
Dep. Variable: | Perceptions_of_corruption | R-squared: | 0.117 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.090 |
Method: | Least Squares | F-statistic: | 4.432 |
Date: | Wed, 08 Dec 2021 | Prob (F-statistic): | 0.00214 |
Time: | 20:29:06 | Log-Likelihood: | 49.139 |
No. Observations: | 139 | AIC: | -88.28 |
Df Residuals: | 134 | BIC: | -73.60 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 0.7208 | 0.037 | 19.622 | 0.000 | 0.648 | 0.793 |
VRD1 | 0.0147 | 0.006 | 2.643 | 0.009 | 0.004 | 0.026 |
VRD2 | -0.0284 | 0.012 | -2.362 | 0.020 | -0.052 | -0.005 |
DRD1 | 0.0137 | 0.005 | 3.028 | 0.003 | 0.005 | 0.023 |
PROMPTITUDE | 7.031e-05 | 0.000 | 0.151 | 0.880 | -0.001 | 0.001 |
Omnibus: | 37.844 | Durbin-Watson: | 1.247 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 58.867 |
Skew: | -1.399 | Prob(JB): | 1.65e-13 |
Kurtosis: | 4.526 | Cond. No. | 224. |
We build an OLS regression model to find the correlation between the independent variables VRD1, VRD2, DRD1, and PROMPTITUDE and the dependent variable Generosity. We see that the R-squared score is similar to the R-squared score above, with $$R^2 = 0.124$$ which corresponds to a weak relationship: only 12.4% of the variation in Generosity can be explained by these independent variables.
In the summary, we can see that only DRD1 is statistically significant, given that the p-values of the rest of the independent variables are all greater than $$\alpha = 0.01$$ Thus, we cannot conclude that there is a correlation between VRD1, VRD2, or PROMPTITUDE and Generosity.
# build an OLS model of the four features and Generosity
outcome, predictors = patsy.dmatrices(
f"{outcomes_cols[3].replace(' ', '_')} ~ {predictor_str}", features)
model = sm.OLS(outcome, predictors)
# print out the summary of the OLS model
results = model.fit()
results.summary()
Dep. Variable: | Generosity | R-squared: | 0.124 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.098 |
Method: | Least Squares | F-statistic: | 4.736 |
Date: | Wed, 08 Dec 2021 | Prob (F-statistic): | 0.00132 |
Time: | 20:29:06 | Log-Likelihood: | 74.758 |
No. Observations: | 139 | AIC: | -139.5 |
Df Residuals: | 134 | BIC: | -124.8 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | -0.0323 | 0.031 | -1.057 | 0.292 | -0.093 | 0.028 |
VRD1 | -0.0072 | 0.005 | -1.557 | 0.122 | -0.016 | 0.002 |
VRD2 | -0.0023 | 0.010 | -0.225 | 0.822 | -0.022 | 0.018 |
DRD1 | -0.0123 | 0.004 | -3.262 | 0.001 | -0.020 | -0.005 |
PROMPTITUDE | 0.0003 | 0.000 | 0.705 | 0.482 | -0.000 | 0.001 |
Omnibus: | 31.182 | Durbin-Watson: | 1.822 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 51.748 |
Skew: | 1.074 | Prob(JB): | 5.80e-12 |
Kurtosis: | 5.080 | Cond. No. | 224. |
To validate the part of our hypothesis stating that an earlier onset and a faster increasing rate of vaccination possibly correlate with a lower death rate over time, we build an OLS regression model to find the correlation between the independent variables VRD1 (vaccination rate) and PROMPTITUDE (days since the earliest vaccination onset date) and the dependent variable DRD1 (death rate). We see that the R-squared score is moderate, with $$R^2 = 0.322$$ which means that only 32.2% of the variation in DRD1 can be explained by these independent variables.
In the summary, we can see that the p-value for PROMPTITUDE is less than $$\alpha = 0.01$$ This means there is a statistically significant correlation between PROMPTITUDE and DRD1.
However, the p-value for VRD1 is greater than $$\alpha = 0.01$$ Thus, we cannot conclude that there exists a correlation between VRD1 and DRD1.
outcome, predictors = patsy.dmatrices(
"DRD1 ~ VRD1 + PROMPTITUDE", features)
model = sm.OLS(outcome, predictors)
results = model.fit()
results.summary()
Dep. Variable: | DRD1 | R-squared: | 0.322 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.312 |
Method: | Least Squares | F-statistic: | 32.27 |
Date: | Wed, 08 Dec 2021 | Prob (F-statistic): | 3.39e-12 |
Time: | 20:29:06 | Log-Likelihood: | -362.35 |
No. Observations: | 139 | AIC: | 730.7 |
Df Residuals: | 136 | BIC: | 739.5 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
 | coef | std err | t | P>|t| | [0.025 | 0.975]
---|---|---|---|---|---|---|
Intercept | 2.6590 | 0.543 | 4.898 | 0.000 | 1.585 | 3.732 |
VRD1 | -0.2104 | 0.096 | -2.200 | 0.029 | -0.400 | -0.021 |
PROMPTITUDE | -0.0368 | 0.006 | -5.727 | 0.000 | -0.050 | -0.024 |
Omnibus: | 4.281 | Durbin-Watson: | 2.078 |
---|---|---|---|
Prob(Omnibus): | 0.118 | Jarque-Bera (JB): | 2.685 |
Skew: | -0.134 | Prob(JB): | 0.261 |
Kurtosis: | 2.374 | Cond. No. | 170. |
Given that the R-squared of the OLS model using healthy life expectancy is the highest of all, we decide to let this metric be the "happiness score" that we proposed at the beginning of the project. Therefore, we build our regression model on this metric. Considering that our four independent variables (VRD1, VRD2, DRD1, and PROMPTITUDE) are not on the same scale and contain outliers (as illustrated in the EDA), we standardize these features using RobustScaler, which scales the data according to the interquartile range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile). We selected SVR, LinearRegression, and RandomForestRegressor as our models, each of which represents a different paradigm for a regression task. SVR is a support-vector-based regressor that maps the original features into higher dimensions and finds a linear function representing the data within a margin of error; linear regression is a simple linear model that uses an analytical solution to find model parameters from training data (i.e. X@w=Y); the random forest regressor, on the other hand, is a rule-based model that builds a number of decision trees, each of which minimizes the entropy on a subsample of the training data. Our experimental results show that the RandomForestRegressor gives the best R-squared score of 0.741, whereas the Linear Regression and SVR models did not perform as well. However, since our training and validation data are both small, these results might be unstable and could differ if a different split strategy were adopted or more data were used (unfortunately, there is only a limited number of countries in the entire world, so it is essentially a few-shot learning problem, and we believe it would be unethical to train such a model with arbitrarily generated/augmented data).
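For reference, with its default settings RobustScaler transforms each feature as $$x' = \frac{x - \mathrm{median}(x)}{Q_3(x) - Q_1(x)},$$ so the centering and scaling statistics are based on the median and interquartile range and are therefore robust to the outliers noted above.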
# use RobustScaler() to standardize the features so that outliers do not dominate the scale
X, y = RobustScaler().fit_transform(features[['VRD1', 'VRD2', 'DRD1', 'PROMPTITUDE']]), \
    features.Healthy_life_expectancy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# fit our three candidate models on the training split
models = [
    SVR().fit(X_train, y_train),
    LinearRegression().fit(X_train, y_train),
    RandomForestRegressor(random_state=42).fit(X_train, y_train),
]
# R-squared of each model on the held-out test split
init_results = {type(model).__name__: model.score(X_test, y_test) for model in models}
init_results
{'SVR': 0.6249987361536417, 'LinearRegression': 0.6931543731709369, 'RandomForestRegressor': 0.7409978325189819}
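Because the single train/test split above is small, one hedge against split sensitivity (a caveat noted earlier) is k-fold cross-validation. The sketch below is only a rough stability check, not part of the original pipeline; it reuses the `X` and `y` arrays defined above and reports the mean and spread of R² across 5 folds for each model instead of a single held-out score.
# minimal sketch: 5-fold cross-validation as a rough stability check on the single-split scores
from sklearn.model_selection import KFold, cross_val_score

cv = KFold(n_splits=5, shuffle=True, random_state=42)
for model in [SVR(), LinearRegression(), RandomForestRegressor(random_state=42)]:
    scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
    print(f'{type(model).__name__}: mean R2 = {scores.mean():.3f} (std = {scores.std():.3f})')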
Because we observed in the EDA that the regional indicator is a key factor in determining an approximate range for days since earliest onset (i.e. promptitude) and healthy life expectancy (see figure 4), we believe there exist some underlying (i.e. latent) variables such that the data points belonging to each latent variable follow some parameterized distribution. Here, we assume that these latent variables are all parameterized as Gaussians (there are better ways to approximate a parameterized distribution, but given the scope of the class, we think assuming Gaussians is sufficient to make the case, even for extra credit). Simply put, we wonder whether we can recover the regional indicators without explicitly using them.
Therefore, we use a Gaussian Mixture Model to fit and predict cluster labels for our data. To validate that these cluster labels actually resemble regional indicators, we compare the normalized mutual information (NMI) score between the regional indicator labels and the clustering labels against the NMI score between the regional indicator labels and randomly assigned labels. The results show a significantly higher NMI score for the clustering labels than for the random labels at every number of components, with the highest NMI score achieved at 6 components. At this stage, we are confident that a Gaussian mixture model approximately clusters data points according to their respective regional indicators, but we need to dig deeper into the results to see why 6 components work better than 10 (the regional indicators have 10 distinct categories in total). To analyze this, we use t-SNE to visualize the difference between the clustering labels and the actual regional indicator labels.
# compare NMI between regional indicators and GMM cluster labels vs. randomly assigned labels
latent_stats = {
    'Number_of_Components': [],
    'NMI_Score_on_Dummy': [],
    'NMI_Score_on_GMM': [],
}
for i in range(1, df['Regional indicator'].nunique()+1):
    np.random.seed(42)
    latent_stats['Number_of_Components'].append(i)
    latent_stats['NMI_Score_on_GMM'].append(
        normalized_mutual_info_score(df['Regional indicator'].values,
                                     GaussianMixture(i).fit_predict(X)),
    )
    latent_stats['NMI_Score_on_Dummy'].append(
        normalized_mutual_info_score(df['Regional indicator'].values,
                                     np.random.randint(i, size=len(y)))
    )
latent_stats = pd.DataFrame(latent_stats)
plt.rcParams['figure.figsize'] = [9.8, 5.5]
f, ax = plt.subplots()
latent_stats.set_index('Number_of_Components').plot(
    kind='bar', ax=ax, xlabel='Number of Components',
    title='Normalized Mutual Information Score on Regional Indicator Inference with GMM',
    ylabel='Normalized Mutual Information Score');
This t-SNE result gives us sufficient insight into why having 6 components works better than having 10 components in terms of normalized mutual information score. As we can see, almost all data points assigned to cluster 1 belong to either Western Europe or Central and Eastern Europe (with two exceptions, one from Southeast Asia and the other from North America and ANZ). On the other hand, almost all data points assigned to cluster 0 belong to Middle East and North Africa or Sub-Saharan Africa. The other cluster labels don't show significant correlations with specific regional indicators, but clusters 0 and 1 are by themselves sufficient to show that these Gaussians do reconstruct information about the regional indicators.
np.random.seed(42)
# project the scaled features to 2D with t-SNE and color points by their GMM cluster label
coordinates = TSNE(n_components=2).fit_transform(X)
latent_cat = GaussianMixture(6).fit_predict(X)
f, ax = plt.subplots()
sns.scatterplot(
    x=coordinates[:, 0],
    y=coordinates[:, 1],
    ax=ax,
    hue=latent_cat,
    style=df['Regional indicator'].values,
    palette='tab10',
)
ax.set_title('Visualization on Relationships Between Latent Modeling Results and Actual Regional Indicators via TSNE')
ax.legend(bbox_to_anchor=(1.05, 1), ncol=2, fontsize='small');
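To complement the visual reading of the t-SNE plot, the correspondence between clusters and regions discussed above can also be tabulated directly. The following is a minimal sketch (assuming `latent_cat` and `df` from the cell above, with pandas imported as `pd` as elsewhere in the notebook) that counts how many countries from each regional indicator fall into each GMM cluster.
# minimal sketch: cross-tabulate GMM cluster labels against regional indicators
cluster_vs_region = pd.crosstab(df['Regional indicator'].values, latent_cat,
                                rownames=['Regional indicator'], colnames=['GMM cluster'])
cluster_vs_region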
Now that we know that fitting a Gaussian mixture model to our independent variables (i.e. VRD1, VRD2, DRD1, PROMPTITUDE) recovers underlying information about the regional indicators, we wonder whether adding these clustering labels improves the performance of our models. We therefore one-hot encode the clustering labels from the Gaussian mixture results and append these encodings to the model input. The results and figure below show that the addition of latent variable modeling slightly improves the Random Forest Regressor, while SVR and Linear Regression perform marginally worse than before. Intuitively, for SVR and Linear Regression each clustering label acts as a shared intercept for all data points carrying that label, while for the Random Forest Regressor each clustering label serves as an extra feature that the trees can split on to further reduce impurity.
np.random.seed(42)
# one-hot encode the GMM cluster labels and append them to the scaled features
z = OneHotEncoder(sparse=False).fit_transform(
    pd.DataFrame(GaussianMixture(6).fit_predict(X)))
X_with_latent = np.concatenate([X, z], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_with_latent, y, test_size=0.33, random_state=42)
# refit the same three models on the augmented features
models = [
    SVR().fit(X_train, y_train),
    LinearRegression().fit(X_train, y_train),
    RandomForestRegressor(random_state=42).fit(X_train, y_train),
]
latent_results = {type(model).__name__: model.score(X_test, y_test) for model in models}
latent_results
{'SVR': 0.6186086444279955, 'LinearRegression': 0.6805391872165707, 'RandomForestRegressor': 0.7459539079240782}
plt.rcParams['figure.figsize'] = [9.8, 5.5]
f, ax = plt.subplots()
comparison = pd.concat([pd.Series(init_results), pd.Series(latent_results)], axis=1)
comparison.columns = ['Model_Trained_Without_Z', 'Model_Trained_With_Z']
comparison.plot(kind='barh', ax=ax, rot=70, title='Model Performance Comparison on Test Set');
ax.set_xlabel('R2 on Test Set');
Based on the EDA and the above statistical analysis results, we derive the following key takeaways:
Data collection
We need to make sure that the data collectors obtained informed consent from participants while collecting the data. Our dataset's smallest unit is a statistic from an individual country rather than an individual person, so we don't expect anything extracted from this dataset to raise ethical issues or violate privacy. The underlying data collection from individuals within each country may raise ethics or privacy issues, but since we do not know the details of that collection process, we will not comment further on it.
We also need to make sure our data is as unbiased as possible. We are not worried about the sampling method because most of our data were collected from governmental public sources and merged on Kaggle. The one dataset that we need to pay more attention to is the happiness score dataset: since it is a somewhat subjective measurement, we need to make sure that the data is unbiased and the sampling is truly random in order to achieve the best prediction.
Another data collection ethical consideration is limiting the exposure of personally identifiable information (PII) so that we can protect participants' privacy and anonymity. However, this is not a concern here because none of our datasets contain PII, and it is impossible to extract any individual information from a dataset whose unit is a whole country.
Data storage
Analysis
We will respect our data. No matter what the result is, we won't manipulate the data in order to make it fully support our hypotheses. If there does not exist any association between vaccination rates, death rates, and happiness scores, we won't modify our data to create that association; instead, we would explore associations between the happiness score and other variables.
In terms of data visualization, we need to present our data with honesty and integrity. We won't exaggerate associations, hide differences or make them look negligible, or enlarge the scale to make something small look big.
We should also minimize the exposure of personally identifiable information (PII) when making the analyses. In our case, this is not a concern.
We need to address the following questions before running our hypothesis test.
Modeling
Proxy discrimination: We will ensure that we are not using discriminatory variables, so that we protect fairness across groups such as gender, race, and job title. This should not be an issue since our dataset does not contain any such variables.
Fairness across groups: In our case, we should test our model results for fairness with respect to different countries.
Explainability: Our explanation for this experiment should be precise, proper, and easy to understand.
Metric selection: We should use appropriate metrics to make our results more understandable and explain any limitations or shortcomings.
Communicate bias: We should also list limitations and potential improvements at the end of our research. One limitation is that some vaccines require two shots, but some only need one; the shot difference might also contribute to the different vaccination rates.
Reproducibility and replicability
Here we would like to reinforce our question:
How are the onset of vaccination, the vaccination rates, and the death rate in a country related to the happiness score of that country?
And the hypothesis:
An earlier onset and a faster increase in vaccination, which possibly correlate with a lower death rate over time, result in a potentially higher happiness score.
We would like to draw a comprehensive conclusion based on all the above analysis: while increasing average new vaccination rates do not generally correlate with decreasing average new death rates, the onset of vaccination does have a negative relationship with the average new death rates in developed countries and a positive relationship with the average new death rates in developing countries. When all three factors are taken into account, they have a high correlation with two metrics of the happiness score, namely social support and healthy life expectancy, and a low correlation with the other two metrics, namely generosity and perception of corruption. We cannot draw conclusions from the coefficients of the variables in our analysis because the independent variables have gone through PCA and standardization, which makes the actual meaning of each coefficient hard to interpret. However, from the figures in the EDA section we can still observe that the average new vaccination rates and the average new death rates correlate positively with the two metrics of the happiness score that are highly correlated with our independent variables, and that the number of days since the earliest national onset of vaccination correlates negatively with these two highly-correlated metrics.
We would also like to discuss the limitations of this project. In particular, we did not take into account the effects of different COVID-19 variants, different brands of vaccines, different degrees of general precaution people take against the virus, or the different levels of statistical precision in counting the daily numbers of vaccinations and deaths across countries. These four differences can strongly bias our conclusion and yield unexpected results. For instance, if a developing country initially had a poor surveillance system for counting deaths caused by COVID-19 but improved it significantly later on, we would likely see an increasing trend in the average new death rate as we increase the time span, even though we also see an increasing trend in the average new vaccination rate. Likewise, if a country started vaccination before the virus was widely transmitted, but transmission later grew exponentially, we would likely see the same pattern (i.e. a positive correlation between the average new vaccination rate and the average new death rate across different time spans). Finally, we assumed that the vaccination and death statistics countries report regarding COVID-19 are faithful. In reality, there remains a possibility that governments report inaccurate data to the WHO for various reasons, which could also bias our conclusion. Again, our conclusion was based on the analysis of the data we collected from Kaggle; if more factors were taken into account and the data were more faithful, it would not be surprising to reach a different conclusion.
We did not assign teammates to specific sections; all of us contributed to every section and worked together on every checkpoint. Each of us put considerable effort into building this project.
Meeting Date | Meeting Time | Completed Before Meeting | Discuss at Meeting |
---|---|---|---|
10/20 | 5 PM | Read & Think about COGS 108 expectations; brainstorm topics/questions | Determine best form of communication; Discuss and decide on final project topic; discuss hypothesis; begin background research |
10/21 | 5 PM | Do background research on topic | Discuss ideal dataset(s) and ethics; draft project proposal |
10/23 | 10 PM | Edit, finalize, and submit proposal; Search for datasets | Discuss Wrangling and possible analytical approaches; Assign group members to lead each specific part |
11/15 | 6 PM | Import & Wrangle Data; EDA | Review/Edit wrangling/EDA; Discuss Analysis Plan; Submit Checkpoint #1: Data* |
11/19 | 5 PM | Finalize wrangling/EDA; Begin Analysis | Discuss/edit Analysis; Submit Checkpoint #2: EDA* |
11/30 | 5 PM | Complete analysis; Draft results/conclusion/discussion | Discuss/edit full project |
12/1 | 5 PM | Finalize project | Turn in Final Project & Group Project Surveys |