Compare Ben Murrell and Trevor Bedford growth rate estimates¶

In [1]:
# this cell is tagged parameters for `snakemake` parameterization
murrell_rates_csv = None
bedford_rates_url = None
pango_json = None
first_date_str = None
In [2]:
# Parameters
murrell_rates_csv = "MultinomialLogisticGrowth/model_fits/rates.csv"
pango_json = "results/compare_natural/pango-consensus-sequences_summary.json"
bedford_rates_url = "https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/2023-10-02_results.json"
first_date_str = "2023-01-01"
In [3]:
import gzip
import json
import urllib.request

import altair as alt

import pandas as pd

Read Bedford growth rates:

In [4]:
with urllib.request.urlopen(bedford_rates_url) as url:
    bedford_growth_d = json.loads(gzip.decompress(url.read()))

# convert to data frame
bedford_growth = (
    pd.DataFrame(bedford_growth_d["data"])
    .query("location == 'hierarchical'")
    .query("site == 'ga'")
    .pivot_table(index="variant", values="value", columns="ps")
    .reset_index(names="clade")
    .rename(columns={"median": "Bedford clade growth"})
    [["clade", "Bedford clade growth"]]
    .query("clade != 'other'")
)
assert len(bedford_growth) == bedford_growth["clade"].nunique()

print(f"Read Bedford growth data for {len(bedford_growth)} clades")
Read Bedford growth data for 137 clades

Read Murrell growth rates:

In [5]:
murrell_growth = pd.read_csv(murrell_rates_csv).rename(
    columns={"pango": "clade", "R": "Murrell clade growth"}
)

print(f"Read Murrell growth data for {len(murrell_growth)} clades")
Read Murrell growth data for 990 clades

Merge the growth rates and add clade designation dates:

In [6]:
with open(pango_json) as f:
    clade_dates = (
        pd.Series({clade: d["designationDate"] for clade, d in json.load(f).items()})
        .rename("date")
        .rename_axis("clade")
        .reset_index()
        .assign(date=lambda x: pd.to_datetime(x["date"]))
    )

first_date = pd.to_datetime(first_date_str)
growth = (
    bedford_growth
    .merge(murrell_growth, validate="one_to_one", on="clade")
    .merge(clade_dates, validate="one_to_one", on="clade")
    .query(f"date >= @first_date")
)

print(f"Growth rates for both Murrell and Bedford for {len(growth)} clades")

corr = growth["Murrell clade growth"].corr(growth["Bedford clade growth"])
print(f"The Pearson correlation is {corr:.2f}")
Growth rates for both Murrell and Bedford for 124 clades
The Pearson correlation is 0.90

Plot the correlation among growth rates:

In [7]:
growth_chart = (
    alt.Chart(growth)
    .encode(
        alt.X("Murrell clade growth", scale=alt.Scale(zero=False, nice=False, padding=8)),
        alt.Y("Bedford clade growth", scale=alt.Scale(zero=False, nice=False, padding=8)),
        tooltip=[
            alt.Tooltip("Murrell clade growth", format=".3g"),
            alt.Tooltip("Bedford clade growth", format=".3g"),
            "clade",
            "date",
        ],
    )
    .mark_circle(size=60, opacity=0.5, stroke="black", strokeWidth=0.5)
    .configure_axis(grid=False)
    .properties(
        width=200,
        height=200,
        title=alt.TitleParams(
            "correlation of clade growth estimates",
            subtitle=[
                f"Pearson R = {corr:.2f}",
                f"Showing clades designated on or after {first_date_str}",
            ],
            dy=-8,
        )
    )
)

growth_chart
Out[7]:
In [ ]: