Compare Ben Murrell and Trevor Bedford growth rate estimatesΒΆ
In [1]:
# this cell is tagged parameters for `snakemake` parameterization
murrell_rates_csv = None
bedford_rates_url = None
pango_json = None
first_date_str = None
In [2]:
# Parameters
murrell_rates_csv = "MultinomialLogisticGrowth/model_fits/rates.csv"
pango_json = "results/compare_natural/pango-consensus-sequences_summary.json"
bedford_rates_url = "https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/2023-10-02_results.json"
first_date_str = "2023-01-01"
In [3]:
import gzip
import json
import urllib.request
import altair as alt
import pandas as pd
Read Bedford growth rates:
In [4]:
with urllib.request.urlopen(bedford_rates_url) as url:
bedford_growth_d = json.loads(gzip.decompress(url.read()))
# convert to data frame
bedford_growth = (
pd.DataFrame(bedford_growth_d["data"])
.query("location == 'hierarchical'")
.query("site == 'ga'")
.pivot_table(index="variant", values="value", columns="ps")
.reset_index(names="clade")
.rename(columns={"median": "Bedford clade growth"})
[["clade", "Bedford clade growth"]]
.query("clade != 'other'")
)
assert len(bedford_growth) == bedford_growth["clade"].nunique()
print(f"Read Bedford growth data for {len(bedford_growth)} clades")
Read Bedford growth data for 137 clades
Read Murrell growth rates:
In [5]:
murrell_growth = pd.read_csv(murrell_rates_csv).rename(
columns={"pango": "clade", "R": "Murrell clade growth"}
)
print(f"Read Murrell growth data for {len(murrell_growth)} clades")
Read Murrell growth data for 990 clades
Merge the growth rates and add clade designation dates:
In [6]:
with open(pango_json) as f:
clade_dates = (
pd.Series({clade: d["designationDate"] for clade, d in json.load(f).items()})
.rename("date")
.rename_axis("clade")
.reset_index()
.assign(date=lambda x: pd.to_datetime(x["date"]))
)
first_date = pd.to_datetime(first_date_str)
growth = (
bedford_growth
.merge(murrell_growth, validate="one_to_one", on="clade")
.merge(clade_dates, validate="one_to_one", on="clade")
.query(f"date >= @first_date")
)
print(f"Growth rates for both Murrell and Bedford for {len(growth)} clades")
corr = growth["Murrell clade growth"].corr(growth["Bedford clade growth"])
print(f"The Pearson correlation is {corr:.2f}")
Growth rates for both Murrell and Bedford for 124 clades The Pearson correlation is 0.90
Plot the correlation among growth rates:
In [7]:
growth_chart = (
alt.Chart(growth)
.encode(
alt.X("Murrell clade growth", scale=alt.Scale(zero=False, nice=False, padding=8)),
alt.Y("Bedford clade growth", scale=alt.Scale(zero=False, nice=False, padding=8)),
tooltip=[
alt.Tooltip("Murrell clade growth", format=".3g"),
alt.Tooltip("Bedford clade growth", format=".3g"),
"clade",
"date",
],
)
.mark_circle(size=60, opacity=0.5, stroke="black", strokeWidth=0.5)
.configure_axis(grid=False)
.properties(
width=200,
height=200,
title=alt.TitleParams(
"correlation of clade growth estimates",
subtitle=[
f"Pearson R = {corr:.2f}",
f"Showing clades designated on or after {first_date_str}",
],
dy=-8,
)
)
)
growth_chart
Out[7]:
In [ ]: