Compare functional effects of mutations to other data¶

This notebook compares the functional effects of mutations measured in the deep mutational scanning (on the latent phenotype) to several other sources of data:

The effects of mutations on expression and pre-fusion stability in a region of S2 as measured by Tan, …, Wu.
The effects of mutations on NTD expression as measured by Ouyang, …, Wu.
The effects of mutations on ACE2 affinity in the RBD as measured by Starr, …, Bloom
The number of counts of mutations in the UShER tree compared to that expected from the underlying mutation rate, as computed at https://github.com/jbloomlab/SARS2-mut-rates

First, import Python modules:

[1]:

import itertools
import os
import warnings

import altair as alt

import numpy

import pandas as pd

import scipy

import yaml

_ = alt.data_transformers.disable_max_rows()

warnings.filterwarnings("ignore", category=FutureWarning)  # altair deprecation warning

[2]:

# this cell is tagged parameters and will be parameterized by `papermill`
clade = "all"

[3]:

# Parameters
clade = "21K"

Read the configuration:

[4]:

# to run interactively, need the following
# os.chdir("../")

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Get the data to compare¶

Read the Tan et al S2 prefusion data:

[5]:

# rename columns in this way
prefusion_tan_columns = {
    "mut": "mutation",
    "exp_score": "S2 expression score (Tan et al)",
    "fus_score": "S2 fusion score (Tan et al)",
}

# read prefusion data
print(f"Reading data from {config['prefusion_Tan_excel']}")
prefusion_tan = (
    pd.read_excel(config["prefusion_Tan_excel"])
    .query("mut_class == 'missense'")
    [prefusion_tan_columns.keys()]
    .rename(columns=prefusion_tan_columns)
)


prefusion_tan

Reading data from https://www.biorxiv.org/content/biorxiv/early/2022/09/26/2022.09.24.509341/DC2/embed/media-2.xlsx

[5]:

	mutation	S2 expression score (Tan et al)	S2 fusion score (Tan et al)
0	T883C	0.039380	-0.075710
1	T883E	1.513755	0.696441
2	T883D	0.494139	1.346548
3	T883W	1.200064	1.473806
4	T883P	0.987880	0.458952
...	...	...	...
2881	L1034T	0.917240	1.004678
2882	L1034Y	0.289642	1.754473
2883	L1034I	0.699406	1.420713
2884	L1034F	0.774203	-0.490190
2885	L1034A	0.095609	0.576563

2886 rows × 3 columns

Read Ouyang et al NTD data:

[6]:

ntd_ouyang_columns = {
    "mut": "mutation",
    "Exp_score": "NTD expression score (Ouyang et al)",
}

print(f"Reading data from {config['ntd_Ouyang']}")
ntd_ouyang = (
    pd.read_csv(config["ntd_Ouyang"], sep="\t")
    .query("mut_class == 'missense'")
    .query("avg_total_freq >= 0.000075")  # QC cutoff in paper
    [ntd_ouyang_columns.keys()]
    .rename(columns=ntd_ouyang_columns)
    .reset_index(drop=True)
)

ntd_ouyang

Reading data from https://raw.githubusercontent.com/nicwulab/SARS-CoV-2_NTD_DMS/main/result/NTD_DMS_scores.tsv

[6]:

	mutation	NTD expression score (Ouyang et al)
0	Q14V	0.062160
1	C15S	0.389341
2	V16E	0.369432
3	V16D	0.595039
4	V16R	0.815205
...	...	...
3750	C301V	0.462155
3751	C301L	0.517864
3752	C301F	0.462618
3753	C301Y	0.965599
3754	C301W	0.892456

3755 rows × 2 columns

Read the Starr et al RBD deep mutational scanning:

[7]:

# rename columns in this way
rbd_dms_starr_columns = {
    "mutation": "mutation",
    "delta_bind": "RBD ACE2 affinity (Starr et al)",
    "delta_expr": "RBD expression (Starr et al)",
}

# read the data
target = config["rbd_dms_Starr_target"]
print(f"Reading data from {config['rbd_dms_Starr']} for target {target}")
rbd_dms_starr = (
    pd.read_csv(config["rbd_dms_Starr"])
    .query("target == @target")
    [rbd_dms_starr_columns.keys()]
    .rename(columns=rbd_dms_starr_columns)
)

rbd_dms_starr

Reading data from https://media.githubusercontent.com/media/jbloomlab/SARS-CoV-2-RBD_DMS_Omicron/f76ba2b2bec18ede9fa9da18c9ccc52389b1ba3a/results/final_variant_scores/final_variant_scores.csv for target Omicron_BA1

[7]:

	mutation	RBD ACE2 affinity (Starr et al)	RBD expression (Starr et al)
0	N331A	-0.24400	-0.83011
1	N331C	-0.68319	-1.33642
2	N331D	-0.22267	-0.54616
3	N331E	-0.21514	-0.49162
4	N331F	-0.40294	-1.18490
...	...	...	...
4015	T531S	-0.01797	0.07250
4016	T531T	0.00000	0.00000
4017	T531V	0.02328	-0.05500
4018	T531W	-0.00754	-0.10566
4019	T531Y	0.02979	-0.10553

4020 rows × 3 columns

Read the mutation effects on the observed phenotype as measured in the current project. Use reference site numbering and only get amino-acid mutations not involving gaps or stop codons:

[8]:

times_seen = config["muteffects_plot_kwargs"]["addtl_slider_stats"]["times_seen"]

print(f"Reading muteffects from {config['muteffects_observed']}")
print(f"Only keeping those with times_seen >= {times_seen}")

muteffects = (
    pd.read_csv(config["muteffects_observed"])
    .query("wildtype != mutant")
    .query("(wildtype not in ['-', '*']) and (mutant not in ['-', '*'])")
    .query("times_seen >= @times_seen")
    .assign(
        mutation=lambda x: x["wildtype"] + x["reference_site"].astype(str) + x["mutant"],
    )
    .rename(columns={"effect": "effect in DMS (current study)"})
    .reset_index(drop=True)
    [["mutation", "effect in DMS (current study)", "times_seen"]]
)

muteffects

Reading muteffects from results/muteffects_functional/muteffects_observed.csv
Only keeping those with times_seen >= 3

[8]:

	mutation	effect in DMS (current study)	times_seen
0	M1I	-2.2303	8.75
1	M1T	-2.4823	3.50
2	F2L	0.2451	9.25
3	F2S	0.3820	10.00
4	F2V	0.3639	4.75
...	...	...	...
7211	S1252R	0.2863	81.00
7212	S1252T	-0.2164	117.75
7213	S1252V	0.3255	88.00
7214	S1252W	0.2297	32.00
7215	S1252Y	0.4444	138.25

7216 rows × 3 columns

Read the natural counts of mutations (from UShER tree), aggregate by amino-acid mutation (they are initially by codon mutation), and compute the log_2 enrichment of observed versus actual counts after adding a pseudocount:

[9]:

actual_vs_expected = pd.read_csv(config["actual_vs_expected_mut_counts"], low_memory=False)

if clade == "all":
    clades = actual_vs_expected["clade"].unique().tolist()
else:
    clades = [clade]

pseudocount = config["actual_vs_expected_pseudocount"]
min_expected = config["actual_vs_expected_min_expected"]

print(f"Reading mutation counts in {config['actual_vs_expected_mut_counts']}")
print(f"Using counts for these clades: {clades}")
print(f"Computing log2 enrichments using pseudocount of {pseudocount}")
print(f"Flagging to retain only mutations with >= {min_expected} expected counts")

actual_vs_expected = (
    actual_vs_expected
    .query("subset == 'all'")  # use sequences from all location
    .query("clade in @clades")  # get just the clade of interest
    .query("not exclude")  # ignore mutation specified to be excluded
    .query("gene == 'S'")  # just look at spike
    .query("not synonymous")  # just look at amino-acid mutations
    .query("mutant_aa != '*'")  # just look at amino-acid (not stop codon) mutations
    .groupby("aa_mutation", as_index=False)
    .aggregate({"actual_count": "sum", "expected_count": "sum"})
    .assign(
        log2_enrichment=lambda x: numpy.log(
            (x["actual_count"] + pseudocount) / (x["expected_count"] + pseudocount)
        ) / numpy.log(2),
        adequate_expected_counts=lambda x: x["expected_count"] >= min_expected,
    )
    .reset_index(drop=True)
)

Reading mutation counts in SARS2-mut-rates/results/expected_vs_actual_mut_counts/expected_vs_actual_mut_counts.csv
Using counts for these clades: ['21K']
Computing log2 enrichments using pseudocount of 0.5
Flagging to retain only mutations with >= 20 expected counts

Plot the distribution of expected counts for all mutations, to choose a good cutoff for these:

[10]:

expected_chart = (
    alt.Chart(
        actual_vs_expected
        .assign(
            rank=lambda x: x["expected_count"].rank(method="first", ascending=False)
        )
    )
    .encode(
        x=alt.X("rank", scale=alt.Scale(nice=False)),
        y="expected_count",
        color="adequate_expected_counts",
        tooltip=actual_vs_expected.columns.tolist(),
    )
    .mark_circle()
    .properties(height=200, width=500)
    .configure_axis(grid=False)
)

expected_chart

[10]:

[11]:

enrichment_histogram = (
    alt.Chart(actual_vs_expected)
    .encode(
        x=alt.X("log2_enrichment", bin=alt.Bin(maxbins=20)),
        y="count()",
        column="adequate_expected_counts",
        color="adequate_expected_counts",
    )
    .mark_bar()
    .properties(height=200, width=250)
)

enrichment_histogram

[11]:

Now get just the natural counts we are retaining:

[12]:

natural_enrichment = (
    actual_vs_expected
    .query("adequate_expected_counts")
    .drop(columns="adequate_expected_counts")
    .reset_index(drop=True)
    .rename(
        columns={
            "aa_mutation": "mutation",
            "log2_enrichment": "natural sequence enrichment (log2)",
            "actual_count": "natural sequence actual count",
            "expected_count": "natural sequence expected count",
        }
    )
)

natural_enrichment

[12]:

	mutation	natural sequence actual count	natural sequence expected count	natural sequence enrichment (log2)
0	A1015S	3	28.6290	-3.057029
1	A1015T	0	22.8920	-5.547943
2	A1015V	0	80.4500	-7.338959
3	A1016S	6	28.6290	-2.163945
4	A1016T	4	22.8920	-2.378018
...	...	...	...	...
1587	W633L	0	28.6290	-5.864384
1588	W64C	6	30.9348	-2.273851
1589	W64L	16	28.6290	-0.819990
1590	W886C	0	30.9348	-5.974291
1591	W886L	11	28.6290	-1.340822

1592 rows × 4 columns

Compute correlations among mutational effects and other properties¶

Correlate all experimental measures with each other. Here, each plot contains all the mutations that have measurement of the two variables of interest, so each scatter plot may have different mutations shown. The plots are interactive: you can mouse over points for details.

[13]:

# data frames and columns with variables to correlate
dfs_to_correlate = [
    (natural_enrichment, ["natural sequence enrichment (log2)"]),
    (muteffects, ["effect in DMS (current study)"]),
    (rbd_dms_starr, ["RBD ACE2 affinity (Starr et al)", "RBD expression (Starr et al)"]),
    (ntd_ouyang, ["NTD expression score (Ouyang et al)"]),
    (prefusion_tan, ["S2 expression score (Tan et al)", "S2 fusion score (Tan et al)"])
]

corr_charts = {}

selection_mutation = alt.selection_single(
    on="mouseover", fields=["mutation"], empty="none",
)

for (df1, cols1), (df2, cols2) in itertools.combinations(dfs_to_correlate, 2):

    merged_df = df1.merge(df2, on="mutation", validate="one_to_one")

    for col1, col2 in itertools.product(cols1, cols2):

        n = len(merged_df[merged_df[col1].notnull() & merged_df[col2].notnull()])

        if not n:
            continue

        r, p = scipy.stats.pearsonr(merged_df[col1], merged_df[col2])

        chart = (
            alt.Chart(merged_df)
            .encode(
                x=alt.X(col2, axis=alt.Axis(grid=False)),
                y=alt.Y(col1, axis=alt.Axis(grid=False)),
                tooltip=[
                    alt.Tooltip(c, format=".3g", title=c.replace("natural sequence ", ""))
                    if merged_df[c].dtype == float
                    else alt.Tooltip(c, title=c.replace("natural sequence ", ""))
                    for c in merged_df.columns
                ],
                opacity=alt.condition(selection_mutation, alt.value(1), alt.value(0.15)),
                color=alt.condition(selection_mutation, alt.value("orange"), alt.value("black")),
                size=alt.condition(selection_mutation, alt.value(55), alt.value(35)),
                strokeWidth=alt.condition(selection_mutation, alt.value(1.5), alt.value(0)),
            )
            .mark_circle(stroke="black")
            .properties(
                title=alt.TitleParams(
                    f"R={r:.2g}, N={n}", fontWeight="normal", fontSize=11, offset=-1,
                ),
                width=170,
                height=170,
            )
            .add_selection(selection_mutation)
        )

        corr_charts[(col1, col2)] = chart

Make a compound chart with all the correlation charts:

[14]:

charts_per_row = 4
chart_rows = []
for i in range(0, len(corr_charts), charts_per_row):
    chart_rows.append(alt.hconcat(*list(corr_charts.values())[i: i + charts_per_row]))

all_charts = alt.vconcat(*chart_rows)

chartfile = f"results/compare_muteffects/{clade}_natural_enrichment_vs_dms.html"
print(f"Saving to {chartfile}")
all_charts.save(chartfile)

all_charts

Saving to results/compare_muteffects/21K_natural_enrichment_vs_dms.html

[14]:

Make chart with just correlation of natural sequence enrichment and experiments, and just experiments with each other:

[15]:

ncols = 3

for vs_natural in [True, False]:
    print(f"\nCharts with {vs_natural=}")
    charts = [
        chart for (col1, _), chart in corr_charts.items()
        if col1.startswith("natural") == vs_natural
    ]
    chart_rows = []
    for i in range(0, len(charts), ncols):
        chart_rows.append(alt.hconcat(*charts[i: i + ncols]))
    chart = alt.vconcat(*chart_rows)
    display(chart)


Charts with vs_natural=True


Charts with vs_natural=False

Chart with just mutations with RBD DMS¶

Make a chart just with mutations having data on all of natural sequence enrichment, muteffects from the current DMS, and the Starr et al RBD DMS:

[16]:

merged_rbd_df = (
    muteffects
    .merge(rbd_dms_starr, on="mutation", validate="one_to_one")
    .merge(natural_enrichment, on="mutation", validate="one_to_one")
)

merged_rbd_cols = [
    "effect in DMS (current study)",
    "RBD ACE2 affinity (Starr et al)",
    "RBD expression (Starr et al)",
]

n = len(merged_rbd_df)
col1 = "natural sequence enrichment (log2)"

rbd_charts = []
for col2 in merged_rbd_cols:

    r, p = scipy.stats.pearsonr(merged_rbd_df[col1], merged_rbd_df[col2])

    rbd_charts.append(
        alt.Chart(merged_rbd_df)
        .encode(
            x=alt.X(col2, axis=alt.Axis(grid=False)),
            y=alt.Y(col1, axis=alt.Axis(grid=False)),
            tooltip=[
                alt.Tooltip(c, format=".3g", title=c.replace("natural sequence ", ""))
                if merged_rbd_df[c].dtype == float
                else alt.Tooltip(c, title=c.replace("natural sequence ", ""))
                for c in merged_rbd_df.columns
                ],
            opacity=alt.condition(selection_mutation, alt.value(1), alt.value(0.2)),
            size=alt.condition(selection_mutation, alt.value(55), alt.value(35)),
            strokeWidth=alt.condition(selection_mutation, alt.value(1.5), alt.value(0)),
        )
        .mark_circle(stroke="black")
        .properties(
            title=alt.TitleParams(
                f"R={r:.2g}, N={n}", fontWeight="normal", fontSize=11, offset=-1,
            ),
            width=170,
            height=170,
        )
        .add_selection(selection_mutation)
    )

rbd_chart = alt.hconcat(*rbd_charts)

rbd_chart

[16]:

Write data to correlate to a file¶

[17]:

all_dms_df = dfs_to_correlate[0][0]
for df, _ in dfs_to_correlate[1: ]:
    all_dms_df = all_dms_df.merge(df, on="mutation", validate="one_to_one", how="outer")

csvfile = f"results/compare_muteffects/{clade}_natural_enrichment_vs_dms.csv"
print(f"Writing to {csvfile}")
all_dms_df.to_csv(csvfile, index=False, float_format="%.5g")

Writing to results/compare_muteffects/21K_natural_enrichment_vs_dms.csv