Compare functional effects of mutations to other data

This notebook compares the functional effects of mutations measured in the deep mutational scanning (on the latent phenotype) to several other sources of data:

  • The effects of mutations on expression and pre-fusion stability in a region of S2 as measured by Tan, …, Wu.

  • The effects of mutations on NTD expression as measured by Ouyang, …, Wu.

  • The effects of mutations on ACE2 affinity in the RBD as measured by Starr, …, Bloom

  • The number of counts of mutations in the UShER tree compared to that expected from the underlying mutation rate, as computed at https://github.com/jbloomlab/SARS2-mut-rates

First, import Python modules:

[1]:
import itertools
import os
import warnings

import altair as alt

import numpy

import pandas as pd

import scipy

import yaml


_ = alt.data_transformers.disable_max_rows()

warnings.filterwarnings("ignore", category=FutureWarning)  # altair deprecation warning

Read the configuration:

[2]:
# to run interactively, need the following
# os.chdir("../")

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Get the data to compare

Read the Tan et al S2 prefusion data:

[3]:
# rename columns in this way
prefusion_tan_columns = {
    "mut": "mutation",
    "exp_score": "S2 expression score (Tan et al)",
    "fus_score": "S2 fusion score (Tan et al)",
}

# read prefusion data
print(f"Reading data from {config['prefusion_Tan_excel']}")
prefusion_tan = (
    pd.read_excel(config["prefusion_Tan_excel"])
    .query("mut_class == 'missense'")
    [prefusion_tan_columns.keys()]
    .rename(columns=prefusion_tan_columns)
)


prefusion_tan
Reading data from https://www.biorxiv.org/content/biorxiv/early/2022/09/26/2022.09.24.509341/DC2/embed/media-2.xlsx
[3]:
mutation S2 expression score (Tan et al) S2 fusion score (Tan et al)
0 T883C 0.039380 -0.075710
1 T883E 1.513755 0.696441
2 T883D 0.494139 1.346548
3 T883W 1.200064 1.473806
4 T883P 0.987880 0.458952
... ... ... ...
2881 L1034T 0.917240 1.004678
2882 L1034Y 0.289642 1.754473
2883 L1034I 0.699406 1.420713
2884 L1034F 0.774203 -0.490190
2885 L1034A 0.095609 0.576563

2886 rows × 3 columns

Read Ouyang et al NTD data:

[4]:
ntd_ouyang_columns = {
    "mut": "mutation",
    "Exp_score": "NTD expression score (Ouyang et al)",
}

print(f"Reading data from {config['ntd_Ouyang']}")
ntd_ouyang = (
    pd.read_csv(config["ntd_Ouyang"], sep="\t")
    .query("mut_class == 'missense'")
    .query("avg_total_freq >= 0.000075")  # QC cutoff in paper
    [ntd_ouyang_columns.keys()]
    .rename(columns=ntd_ouyang_columns)
    .reset_index(drop=True)
)

ntd_ouyang
Reading data from https://raw.githubusercontent.com/nicwulab/SARS-CoV-2_NTD_DMS/main/result/NTD_DMS_scores.tsv
[4]:
mutation NTD expression score (Ouyang et al)
0 Q14V 0.062160
1 C15S 0.389341
2 V16E 0.369432
3 V16D 0.595039
4 V16R 0.815205
... ... ...
3750 C301V 0.462155
3751 C301L 0.517864
3752 C301F 0.462618
3753 C301Y 0.965599
3754 C301W 0.892456

3755 rows × 2 columns

Read the Starr et al RBD deep mutational scanning:

[5]:
# rename columns in this way
rbd_dms_starr_columns = {
    "mutation": "mutation",
    "delta_bind": "RBD ACE2 affinity (Starr et al)",
    "delta_expr": "RBD expression (Starr et al)",
}

# read the data
target = config["rbd_dms_Starr_target"]
print(f"Reading data from {config['rbd_dms_Starr']} for target {target}")
rbd_dms_starr = (
    pd.read_csv(config["rbd_dms_Starr"])
    .query("target == @target")
    [rbd_dms_starr_columns.keys()]
    .rename(columns=rbd_dms_starr_columns)
)

rbd_dms_starr
Reading data from https://media.githubusercontent.com/media/jbloomlab/SARS-CoV-2-RBD_DMS_Omicron/f76ba2b2bec18ede9fa9da18c9ccc52389b1ba3a/results/final_variant_scores/final_variant_scores.csv for target Omicron_BA1
[5]:
mutation RBD ACE2 affinity (Starr et al) RBD expression (Starr et al)
0 N331A -0.24400 -0.83011
1 N331C -0.68319 -1.33642
2 N331D -0.22267 -0.54616
3 N331E -0.21514 -0.49162
4 N331F -0.40294 -1.18490
... ... ... ...
4015 T531S -0.01797 0.07250
4016 T531T 0.00000 0.00000
4017 T531V 0.02328 -0.05500
4018 T531W -0.00754 -0.10566
4019 T531Y 0.02979 -0.10553

4020 rows × 3 columns

Read the mutation effects on the observed phenotype as measured in the current project. Use reference site numbering and only get amino-acid mutations not involving gaps or stop codons:

[6]:
times_seen = config["muteffects_plot_kwargs"]["addtl_slider_stats"]["times_seen"]

print(f"Reading muteffects from {config['muteffects_observed']}")
print(f"Only keeping those with times_seen >= {times_seen}")

muteffects = (
    pd.read_csv(config["muteffects_observed"])
    .query("wildtype != mutant")
    .query("(wildtype not in ['-', '*']) and (mutant not in ['-', '*'])")
    .query("times_seen >= @times_seen")
    .assign(
        mutation=lambda x: x["wildtype"] + x["reference_site"].astype(str) + x["mutant"],
    )
    .rename(columns={"effect": "effect in DMS (current study)"})
    .reset_index(drop=True)
    [["mutation", "effect in DMS (current study)", "times_seen"]]
)

muteffects
Reading muteffects from results/muteffects_functional/muteffects_observed.csv
Only keeping those with times_seen >= 3
[6]:
mutation effect in DMS (current study) times_seen
0 M1I -2.2303 8.75
1 M1T -2.4823 3.50
2 F2L 0.2451 9.25
3 F2S 0.3820 10.00
4 F2V 0.3639 4.75
... ... ... ...
7211 S1252R 0.2863 81.00
7212 S1252T -0.2164 117.75
7213 S1252V 0.3255 88.00
7214 S1252W 0.2297 32.00
7215 S1252Y 0.4444 138.25

7216 rows × 3 columns

Read the natural counts of mutations (from UShER tree), aggregate by amino-acid mutation (they are initially by codon mutation), and compute the log_2 enrichment of observed versus actual counts after adding a pseudocount:

[7]:
actual_vs_expected = pd.read_csv(config["actual_vs_expected_mut_counts"], low_memory=False)

clades = config["actual_vs_expected_clades"]
if clades == "all":
    clades = actual_vs_expected["clade"].unique().tolist()

pseudocount = config["actual_vs_expected_pseudocount"]
min_expected = config["actual_vs_expected_min_expected"]

print(f"Reading mutation counts in {config['actual_vs_expected_mut_counts']}")
print(f"Using counts for these clades: {clades}")
print(f"Computing log2 enrichments using pseudocount of {pseudocount}")
print(f"Flagging to retain only mutations with >= {min_expected} expected counts")

actual_vs_expected = (
    actual_vs_expected
    .query("subset == 'all'")  # use sequences from all location
    .query("clade in @clades")  # get just the clade of interest
    .query("not exclude")  # ignore mutation specified to be excluded
    .query("gene == 'S'")  # just look at spike
    .query("not synonymous")  # just look at amino-acid mutations
    .query("mutant_aa != '*'")  # just look at amino-acid (not stop codon) mutations
    .groupby("aa_mutation", as_index=False)
    .aggregate({"actual_count": "sum", "expected_count": "sum"})
    .assign(
        log2_enrichment=lambda x: numpy.log(
            (x["actual_count"] + pseudocount) / (x["expected_count"] + pseudocount)
        ) / numpy.log(2),
        adequate_expected_counts=lambda x: x["expected_count"] >= min_expected,
    )
    .reset_index(drop=True)
)
Reading mutation counts in SARS2-mut-rates/results/expected_vs_actual_mut_counts/expected_vs_actual_mut_counts.csv
Using counts for these clades: ['20A', '20B', '20C', '20E', '20G', '20I', '21C', '21I', '21J', '21K', '21M', '22A', '22B']
Computing log2 enrichments using pseudocount of 0.5
Flagging to retain only mutations with >= 20 expected counts

Plot the distribution of expected counts for all mutations, to choose a good cutoff for these:

[8]:
expected_chart = (
    alt.Chart(
        actual_vs_expected
        .assign(
            rank=lambda x: x["expected_count"].rank(method="first", ascending=False)
        )
    )
    .encode(
        x=alt.X("rank", scale=alt.Scale(nice=False)),
        y="expected_count",
        color="adequate_expected_counts",
        tooltip=actual_vs_expected.columns.tolist(),
    )
    .mark_circle()
    .properties(height=200, width=500)
    .configure_axis(grid=False)
)

expected_chart
[8]:
[9]:
enrichment_histogram = (
    alt.Chart(actual_vs_expected)
    .encode(
        x=alt.X("log2_enrichment", bin=alt.Bin(maxbins=20)),
        y="count()",
        column="adequate_expected_counts",
        color="adequate_expected_counts",
    )
    .mark_bar()
    .properties(height=200, width=250)
)

enrichment_histogram
[9]:

Now get just the natural counts we are retaining:

[10]:
natural_enrichment = (
    actual_vs_expected
    .query("adequate_expected_counts")
    .drop(columns="adequate_expected_counts")
    .reset_index(drop=True)
    .rename(
        columns={
            "aa_mutation": "mutation",
            "log2_enrichment": "natural sequence enrichment (log2)",
            "actual_count": "natural sequence actual count",
            "expected_count": "natural sequence expected count",
        }
    )
)

natural_enrichment
[10]:
mutation natural sequence actual count natural sequence expected count natural sequence enrichment (log2)
0 A1015D 0 32.54165 -6.046214
1 A1015S 31 305.70510 -3.281075
2 A1015T 3 135.09420 -5.275797
3 A1015V 4 487.90890 -6.762021
4 A1016E 1 32.54165 -4.461251
... ... ... ... ...
3929 Y904H 12 55.53080 -2.164292
3930 Y917C 1 62.19742 -5.385372
3931 Y917H 2 55.53080 -4.486220
3932 Y91C 2 62.19742 -4.648406
3933 Y91H 6 55.53080 -3.107708

3934 rows × 4 columns

Compute correlations among mutational effects and other properties

Correlate all experimental measures with each other. Here, each plot contains all the mutations that have measurement of the two variables of interest, so each scatter plot may have different mutations shown. The plots are interactive: you can mouse over points for details.

[11]:
# data frames and columns with variables to correlate
dfs_to_correlate = [
    (natural_enrichment, ["natural sequence enrichment (log2)"]),
    (muteffects, ["effect in DMS (current study)"]),
    (rbd_dms_starr, ["RBD ACE2 affinity (Starr et al)", "RBD expression (Starr et al)"]),
    (ntd_ouyang, ["NTD expression score (Ouyang et al)"]),
    (prefusion_tan, ["S2 expression score (Tan et al)", "S2 fusion score (Tan et al)"])
]

corr_charts = {}

selection_mutation = alt.selection_single(
    on="mouseover", fields=["mutation"], empty="none",
)

for (df1, cols1), (df2, cols2) in itertools.combinations(dfs_to_correlate, 2):

    merged_df = df1.merge(df2, on="mutation", validate="one_to_one")

    for col1, col2 in itertools.product(cols1, cols2):

        n = len(merged_df[merged_df[col1].notnull() & merged_df[col2].notnull()])

        if not n:
            continue

        r, p = scipy.stats.pearsonr(merged_df[col1], merged_df[col2])

        chart = (
            alt.Chart(merged_df)
            .encode(
                x=alt.X(col2, axis=alt.Axis(grid=False)),
                y=alt.Y(col1, axis=alt.Axis(grid=False)),
                tooltip=[
                    alt.Tooltip(c, format=".3g", title=c.replace("natural sequence ", ""))
                    if merged_df[c].dtype == float
                    else alt.Tooltip(c, title=c.replace("natural sequence ", ""))
                    for c in merged_df.columns
                ],
                opacity=alt.condition(selection_mutation, alt.value(1), alt.value(0.15)),
                color=alt.condition(selection_mutation, alt.value("orange"), alt.value("black")),
                size=alt.condition(selection_mutation, alt.value(55), alt.value(35)),
                strokeWidth=alt.condition(selection_mutation, alt.value(1.5), alt.value(0)),
            )
            .mark_circle(stroke="black")
            .properties(
                title=alt.TitleParams(
                    f"R={r:.2g}, N={n}", fontWeight="normal", fontSize=11, offset=-1,
                ),
                width=170,
                height=170,
            )
            .add_selection(selection_mutation)
        )

        corr_charts[(col1, col2)] = chart

Make a compound chart with all the correlation charts:

[12]:
charts_per_row = 4
chart_rows = []
for i in range(0, len(corr_charts), charts_per_row):
    chart_rows.append(alt.hconcat(*list(corr_charts.values())[i: i + charts_per_row]))

all_charts = alt.vconcat(*chart_rows)

print(f"Saving to {config['natural_enrichment_vs_dms_plot']}")
all_charts.save(config["natural_enrichment_vs_dms_plot"])

all_charts
Saving to results/compare_muteffects/natural_enrichment_vs_dms.html
[12]:

Make chart with just correlation of natural sequence enrichment and experiments, and just experiments with each other:

[13]:
ncols = 3

for vs_natural in [True, False]:
    print(f"\nCharts with {vs_natural=}")
    charts = [
        chart for (col1, _), chart in corr_charts.items()
        if col1.startswith("natural") == vs_natural
    ]
    chart_rows = []
    for i in range(0, len(charts), ncols):
        chart_rows.append(alt.hconcat(*charts[i: i + ncols]))
    chart = alt.vconcat(*chart_rows)
    display(chart)

Charts with vs_natural=True

Charts with vs_natural=False

Chart with just mutations with RBD DMS

Make a chart just with mutations having data on all of natural sequence enrichment, muteffects from the current DMS, and the Starr et al RBD DMS:

[14]:
merged_rbd_df = (
    muteffects
    .merge(rbd_dms_starr, on="mutation", validate="one_to_one")
    .merge(natural_enrichment, on="mutation", validate="one_to_one")
)

merged_rbd_cols = [
    "effect in DMS (current study)",
    "RBD ACE2 affinity (Starr et al)",
    "RBD expression (Starr et al)",
]

n = len(merged_rbd_df)
col1 = "natural sequence enrichment (log2)"

rbd_charts = []
for col2 in merged_rbd_cols:

    r, p = scipy.stats.pearsonr(merged_rbd_df[col1], merged_rbd_df[col2])

    rbd_charts.append(
        alt.Chart(merged_rbd_df)
        .encode(
            x=alt.X(col2, axis=alt.Axis(grid=False)),
            y=alt.Y(col1, axis=alt.Axis(grid=False)),
            tooltip=[
                alt.Tooltip(c, format=".3g", title=c.replace("natural sequence ", ""))
                if merged_rbd_df[c].dtype == float
                else alt.Tooltip(c, title=c.replace("natural sequence ", ""))
                for c in merged_rbd_df.columns
                ],
            opacity=alt.condition(selection_mutation, alt.value(1), alt.value(0.2)),
            size=alt.condition(selection_mutation, alt.value(55), alt.value(35)),
            strokeWidth=alt.condition(selection_mutation, alt.value(1.5), alt.value(0)),
        )
        .mark_circle(stroke="black")
        .properties(
            title=alt.TitleParams(
                f"R={r:.2g}, N={n}", fontWeight="normal", fontSize=11, offset=-1,
            ),
            width=170,
            height=170,
        )
        .add_selection(selection_mutation)
    )

rbd_chart = alt.hconcat(*rbd_charts)

rbd_chart
[14]:

Write data to correlate to a file

[15]:
all_dms_df = dfs_to_correlate[0][0]
for df, _ in dfs_to_correlate[1: ]:
    all_dms_df = all_dms_df.merge(df, on="mutation", validate="one_to_one", how="outer")

print(f"Writing to {config['natural_enrichment_vs_dms']}")
all_dms_df.to_csv(config["natural_enrichment_vs_dms"], index=False, float_format="%.5g")
Writing to results/compare_muteffects/natural_enrichment_vs_dms.csv