Today we will recreate the first chart from the New York Times article "The unique US failure to control the virus".

import pandas as pd
import altair as alt
from functools import wraps
import datetime as dt

alt.renderers.set_embed_options(actions=False)

population_uri = 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv?raw=true'
deaths_ts_uri = 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv?raw=true'
gdp_current_us_dollars = 'https://gist.githubusercontent.com/armsp/58b43f28b4bf880f3874db80630dec44/raw/959a34a1797b0e3fdc860a6ef0057c62ee898dd7/gdp.csv'
deaths_ts = pd.read_csv(deaths_ts_uri)
deaths_ts.head()
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 10/16/20 10/17/20 10/18/20 10/19/20 10/20/20 10/21/20 10/22/20 10/23/20 10/24/20 10/25/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 1485 1488 1492 1497 1499 1501 1505 1507 1511 1514
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 443 448 451 454 458 462 465 469 473 477
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 1841 1846 1856 1865 1873 1880 1888 1897 1907 1914
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 59 59 59 62 62 63 63 69 69 69
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 234 241 247 248 251 255 260 265 267 268

5 rows × 282 columns

gdp_us = pd.read_csv(gdp_current_us_dollars)
gdp_us.head()
Series Name Series Code Country Name Country Code 2019 [YR2019]
0 GDP per capita (current US$) NY.GDP.PCAP.CD Afghanistan AFG 502.115486913067
1 GDP per capita (current US$) NY.GDP.PCAP.CD Albania ALB 5352.85741103671
2 GDP per capita (current US$) NY.GDP.PCAP.CD Algeria DZA 3948.34327892571
3 GDP per capita (current US$) NY.GDP.PCAP.CD American Samoa ASM ..
4 GDP per capita (current US$) NY.GDP.PCAP.CD Andorra AND 40886.3911648431
population = pd.read_csv(population_uri)
population.head()
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ Combined_Key Population
0 4 AF AFG 4.0 NaN NaN NaN Afghanistan 33.93911 67.709953 Afghanistan 38928341.0
1 8 AL ALB 8.0 NaN NaN NaN Albania 41.15330 20.168300 Albania 2877800.0
2 12 DZ DZA 12.0 NaN NaN NaN Algeria 28.03390 1.659600 Algeria 43851043.0
3 20 AD AND 20.0 NaN NaN NaN Andorra 42.50630 1.521800 Andorra 77265.0
4 24 AO AGO 24.0 NaN NaN NaN Angola -11.20270 17.873900 Angola 32866268.0
def log_step(func):
    """Decorator that logs the output shape and execution time of each pipeline step."""
    @wraps(func)
    def wrapper(dataf, *args, **kwargs):
        start = dt.datetime.now()
        output = func(dataf, *args, **kwargs)
        end = dt.datetime.now()
        print(f"After function {func.__name__} ran, shape of dataframe is - {output.shape}, execution time is - {end-start}")
        return output
    return wrapper
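
As a quick illustration of how a @log_step function plugs into DataFrame.pipe, here is a toy example with made-up data (not part of the actual pipeline):

@log_step
def double_values(dataf):
    """Toy step: double every value."""
    return dataf * 2

toy = pd.DataFrame({'a': [1, 2, 3]})
toy.pipe(double_values)
# Prints something like:
# After function double_values ran, shape of dataframe is - (3, 1), execution time is - 0:00:00.000...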

@log_step
def start_pipeline(dataf):
    return dataf.copy()

@log_step
def remove_cols(dataf, *args):
    """Drop the given columns from the dataframe."""
    return dataf.drop(columns=list(args))

@log_step
def remove_null(dataf):
    return dataf.dropna()

def rename_cols(dataf, **kwargs):
    """Rename raw dataframe columns to shorter names without spaces, which read
    better in the visualization and avoid issues with Altair's shorthand syntax."""
    return dataf.rename(columns=kwargs)

@log_step
def filter_rows(dataf, which):
    """Keep only usable rows: drop missing GDP values ('..') in the GDP data,
    or keep only country-level rows (no Province_State) in the population data."""
    if which == 'gdp':
        return dataf[dataf['current_us'] != '..']
    elif which == 'pop':
        return dataf[pd.isnull(dataf['Province_State'])]

def set_dtypes(dataf):
    """Set the datatypes of columns."""
    # equivalently: dataf.assign(current_us=lambda d: d['current_us'].astype(float))
    dataf['current_us'] = dataf['current_us'].astype(float)
    return dataf

# def remove_outliers(dataf):
#     """remove outliers"""
#     return dataf

# def add_features(dataf):
#     return dataf

@log_step
def clean(dataf):
    """Aggregate deaths per country, keep only the selected wealthy countries,
    and convert cumulative deaths into daily deaths per million up to 4 August 2020.
    Relies on pop_w_gdp and countries_population, which are built below before this step runs."""
    agg_deaths = dataf.groupby('Country/Region').sum().reset_index()
    agg_deaths = agg_deaths[agg_deaths['Country/Region'].isin(pop_w_gdp['Country/Region'])].set_index('Country/Region')
    result = agg_deaths.T.reset_index().rename_axis(None, axis=1).rename(columns={'index': 'Date'})
    result['Date'] = pd.to_datetime(result['Date'], format="%m/%d/%y")
    result = result[result['Date'] < '2020-08-05']

    # Convert cumulative deaths into daily deaths per million inhabitants
    for col in result:
        if col != 'Date':
            result[col] = result[col].diff()
            result[col] = (result[col] / int(countries_population[countries_population['Country/Region'] == col]['Population'])) * 1000000
    return result
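
For reference, the per-million conversion above can also be written without the column loop. The helper below is only an equivalent sketch under the same assumptions (the same countries_population frame and the wide result table); it is not used by the pipeline:

def to_daily_per_million(result, populations):
    """Loop-free equivalent of the conversion inside clean() (sketch only)."""
    pops = populations.set_index('Country/Region')['Population']
    daily = result.set_index('Date').diff()
    # Divide each country column by that country's population, then scale to per million
    return (daily.div(pops.reindex(daily.columns), axis='columns') * 1000000).reset_index()
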
gdp = (gdp_us
       .pipe(start_pipeline)
       .pipe(remove_null)
       .pipe(remove_cols, *['Series Name', 'Series Code'])
       .pipe(rename_cols, **{'2019 [YR2019]': 'current_us'})
       .pipe(filter_rows, 'gdp')
       .pipe(set_dtypes))

countries_population = (population
       .pipe(start_pipeline)
       .pipe(filter_rows, 'pop')
       .pipe(remove_cols, *['UID', 'iso2', 'code3', 'FIPS', 'Admin2', 'Province_State', 'Lat', 'Long_', 'Combined_Key'])
       .pipe(rename_cols, **{'iso3': 'Country Code', 'Country_Region':'Country/Region'}))

# Combine population with GDP per capita
pop_w_gdp = countries_population.merge(gdp, how='inner', on='Country Code')

# Keep only wealthy countries, i.e. GDP per capita above 25,000 USD, with a population above 10 million
pop_w_gdp = pop_w_gdp[(pop_w_gdp['current_us'] > 25000) & (pop_w_gdp['Population'] > 10000000)]

# Build the daily deaths per million data for plotting
plot_data = (deaths_ts
             .pipe(start_pipeline)
             .pipe(remove_cols, *['Province/State', 'Lat', 'Long'])
             .pipe(clean)
             .pipe(remove_null)
            )

plot_data.head()
After function start_pipeline ran, shape of dataframe is - (269, 5), execution time is - 0:00:00.000376
After function remove_null ran, shape of dataframe is - (264, 5), execution time is - 0:00:00.182202
After function remove_cols ran, shape of dataframe is - (264, 3), execution time is - 0:00:00.073348
After function filter_rows ran, shape of dataframe is - (222, 3), execution time is - 0:00:00.005289
After function start_pipeline ran, shape of dataframe is - (4154, 12), execution time is - 0:00:00.006278
After function filter_rows ran, shape of dataframe is - (189, 12), execution time is - 0:00:00.012600
After function remove_cols ran, shape of dataframe is - (189, 3), execution time is - 0:00:00.005748
After function start_pipeline ran, shape of dataframe is - (267, 282), execution time is - 0:00:00.000790
After function remove_cols ran, shape of dataframe is - (267, 279), execution time is - 0:00:00.003718
After function clean ran, shape of dataframe is - (196, 14), execution time is - 0:00:00.212112
After function remove_null ran, shape of dataframe is - (195, 14), execution time is - 0:00:00.004697
Date Australia Belgium Canada France Germany Italy Japan Korea, South Netherlands Spain Sweden US United Kingdom
1 2020-01-23 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2020-01-24 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 2020-01-25 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 2020-01-26 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 2020-01-27 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

This is an interactive plot. Hover over it to see individual lines highlighted.
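
A chart with this hover-to-highlight behavior can be built from plot_data roughly as follows. This is a minimal sketch: the long-format reshape, selection settings, sizing and titles are my own choices and not necessarily identical to the original chart.

# Altair prefers long-format data: one row per (Date, Country) observation
long_data = plot_data.melt(id_vars='Date', var_name='Country', value_name='Daily_Deaths_Per_Million')

# Highlight the hovered country's line and dim the others
hover = alt.selection_single(fields=['Country'], on='mouseover', empty='all')

chart = (alt.Chart(long_data)
         .mark_line()
         .encode(
             x=alt.X('Date:T', title='Date'),
             y=alt.Y('Daily_Deaths_Per_Million:Q', title='Daily deaths per million'),
             color=alt.Color('Country:N'),
             opacity=alt.condition(hover, alt.value(1.0), alt.value(0.2)),
             tooltip=['Country:N', 'Date:T', 'Daily_Deaths_Per_Million:Q'])
         .add_selection(hover)
         .properties(width=700, height=400))

chart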