Compare functional effects of mutations measured in DMS to the abundance in natural sequences¶

In [1]:
# Imports
import os
import warnings
import pandas as pd
import altair as alt
from Bio import SeqIO, AlignIO 

# Plotting colors
# re-arranged for plot
tol_muted_adjusted = [
    "#AA4499",
    "#88CCEE",
    "#EE7733",
    "#44AA99",
    "#1f78b4",
    "#CC6677",
    "#117733",
    "#999933",
    "#DDCC77",
    "#CC3311",
    "#882255",
    "#000000",
    "#DDDDDD",
]

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
natural_GPC_sequence_alignment = None
filtered_func_293T = None
In [3]:
# Parameters
natural_GPC_sequence_alignment = "non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"
filtered_func_293T = "results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
In [4]:
# # Uncomment for running interactive
# natural_GPC_sequence_alignment = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"
# filtered_func_293T = "../results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"

Process data

In [5]:
def get_natural_sequence_counts(site, amino_acid, natural_seqs_df):
    """
    Function that counts occurences of an amino acid at a site
    across a dataframe of sequences.
    """
    count = 0
    for seq in natural_seqs_df["sequence"].tolist():
        if seq[site-1] == amino_acid:
            count += 1
    return count

# Load alignment and metadata info
natural_seqs_df = pd.DataFrame(columns=["strain", "sequence"])

# Add alignment sequence to dataframe
for curr_fasta in AlignIO.read(natural_GPC_sequence_alignment, "fasta"):
    natural_seqs_df.loc[len(natural_seqs_df.index)] = [
        str(curr_fasta.id),
        str(curr_fasta.seq),
    ] 

# Load data as dataframe
functional_scores = pd.read_csv(filtered_func_293T)

# Filter for no stop codons
functional_scores = (
    functional_scores.query(
        "mutant != '*'"
    )
    .drop(columns=["times_seen", "effect_std", "n_selections"])
    .reset_index(drop=True)
)

# DMS strain
chosen_isolates = [
    "Josiah_NC_004296_reverse_complement_2018-08-13",
]

# Create subset of df for chosen isolates
validation_isolates = (
    natural_seqs_df.loc[natural_seqs_df["strain"].isin(chosen_isolates)]
    .reset_index(drop=True)
)

# Get Josiah sequence for comparison
josiah_sequence = validation_isolates.loc[validation_isolates["strain"] == "Josiah_NC_004296_reverse_complement_2018-08-13"].at[0,"sequence"]

# Create a wildtype df to fill in missing Josiah/wildtype measurements as 0
josiah_df = pd.DataFrame(
    zip(list(range(1,492)), josiah_sequence, josiah_sequence, [0]*491, [True]*491),
    columns=["site", "wildtype", "mutant", "effect", "Josiah reference"],
)

# Merge functional effects and josiah reference dataframes
AA_level_df = (
    pd.concat([functional_scores, josiah_df], ignore_index=True)
    .sort_values(by="site")
    .reset_index(drop=True)
    .fillna(False)
)

# Get natural sequence counts of each mutant and calculate mutation frequencies
# compared to the Josiah reference
AA_level_df["natural_counts"] = (
    AA_level_df.apply(lambda x: get_natural_sequence_counts(x["site"], x["mutant"], natural_seqs_df), axis=1)
)

# Mark mutations seen at N times seen in natural sequences
AA_level_df["natural_count_bins"] = "all DMS measured amino-acid mutations"
# At least once
at_least_once = (
    AA_level_df.loc[AA_level_df["natural_counts"] >= 1].copy()
)
at_least_once["natural_count_bins"] = "mutations observed >= 1 in natural GPCs"
AA_level_df = (
    pd.concat([
        AA_level_df, 
        at_least_once,
    ], ignore_index = True)
)
# At least X times
at_least_x = (
    AA_level_df.loc[AA_level_df["natural_counts"] >= 10].copy()
)
at_least_x["natural_count_bins"] = "mutations observed >= 10 in natural GPCs"
AA_level_df = (
    pd.concat([
        AA_level_df, 
        at_least_x,
    ], ignore_index = True)
)

# Remove any mutations that are the Josiah reference because it skews the median to 0
AA_level_df = AA_level_df.loc[(AA_level_df["wildtype"] != AA_level_df["mutant"])].copy()

Plot functional score distributions stratified by times observed in natural sequences

In [6]:
# Plot score distrbutions
distribution_plot = alt.Chart(
        AA_level_df, title="Effect on cell entry for mutations observed in natural GPCs"
    ).mark_boxplot(
    color="black", 
    outliers={"strokeWidth" : 1, "opacity" : 0.25},
    median={"thickness" : 1.5, "color" : "white"},
).encode(
    y=alt.Y(
        "natural_count_bins:N",
        title=None,
        sort=["mutations observed >= 10 in natural GPCs", "mutations observed >= 1 in natural GPCs", "all DMS measured amino-acid mutations"],
        axis=alt.Axis(
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            labelLimit=400,
        ),
    ),
    x=alt.X(
        "effect:Q",
        title="effect on cell entry",
        axis=alt.Axis(
            values=[-4,-3,-2,-1,0,1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.5, 1.25])
    ),
)

# Combine plot
combined_plot = (
    (distribution_plot)
    .configure_axis(
        grid=False,
        labelFontSize=16,
        titleFontSize=16,
        labelFontWeight="normal",
        titleFontWeight="normal",
    ).properties(
        width=400, 
        height=100,
    ).configure_title(
        fontSize=24,
    ).configure_view(
        stroke=None
    )
)

combined_plot
Out[6]:

Recreate same plot above but formatted for paper

In [7]:
# Plot score distrbutions
distribution_plot = alt.Chart(
        AA_level_df, title="Effect on cell entry for mutations observed in natural GPCs"
    ).mark_boxplot(
    color="black", 
    outliers={"strokeWidth" : 1, "opacity" : 0.25, "size" : 10},
    median={"thickness" : 1.5, "color" : "white"},
).encode(
    y=alt.Y(
        "natural_count_bins:N",
        title=None,
        sort=["mutations observed >= 10 in natural GPCs", "mutations observed >= 1 in natural GPCs", "all DMS measured amino-acid mutations"],
        axis=alt.Axis(
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            labelLimit=400,
        ),
    ),
    x=alt.X(
        "effect:Q",
        title="effect on cell entry",
        axis=alt.Axis(
            values=[-4,-3,-2,-1,0,1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.5, 1.25])
    ),
)

# Combine plot
combined_plot = (
    (distribution_plot)
    .configure_axis(
        grid=False,
        labelFontSize=8,
        titleFontSize=8,
        labelFontWeight="normal",
        titleFontWeight="normal",
    ).properties(
        width=100, 
        height=75,
    ).configure_title(
        fontSize=12,
    ).configure_view(
        stroke=None
    )
)

combined_plot
Out[7]: