Compare functional effects of mutations measured in DMS to the abundance in natural sequences¶
In [1]:
# Imports
import os
import warnings
import pandas as pd
import altair as alt
from Bio import SeqIO, AlignIO
# Plotting colors
# re-arranged for plot
tol_muted_adjusted = [
"#AA4499",
"#88CCEE",
"#EE7733",
"#44AA99",
"#1f78b4",
"#CC6677",
"#117733",
"#999933",
"#DDCC77",
"#CC3311",
"#882255",
"#000000",
"#DDDDDD",
]
# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
natural_GPC_sequence_alignment = None
filtered_func_293T = None
In [3]:
# Parameters
natural_GPC_sequence_alignment = "non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"
filtered_func_293T = "results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
In [4]:
# # Uncomment for running interactive
# natural_GPC_sequence_alignment = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"
# filtered_func_293T = "../results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
Process data
In [5]:
def get_natural_sequence_counts(site, amino_acid, natural_seqs_df):
"""
Function that counts occurences of an amino acid at a site
across a dataframe of sequences.
"""
count = 0
for seq in natural_seqs_df["sequence"].tolist():
if seq[site-1] == amino_acid:
count += 1
return count
# Load alignment and metadata info
natural_seqs_df = pd.DataFrame(columns=["strain", "sequence"])
# Add alignment sequence to dataframe
for curr_fasta in AlignIO.read(natural_GPC_sequence_alignment, "fasta"):
natural_seqs_df.loc[len(natural_seqs_df.index)] = [
str(curr_fasta.id),
str(curr_fasta.seq),
]
# Load data as dataframe
functional_scores = pd.read_csv(filtered_func_293T)
# Filter for no stop codons
functional_scores = (
functional_scores.query(
"mutant != '*'"
)
.drop(columns=["times_seen", "effect_std", "n_selections"])
.reset_index(drop=True)
)
# DMS strain
chosen_isolates = [
"Josiah_NC_004296_reverse_complement_2018-08-13",
]
# Create subset of df for chosen isolates
validation_isolates = (
natural_seqs_df.loc[natural_seqs_df["strain"].isin(chosen_isolates)]
.reset_index(drop=True)
)
# Get Josiah sequence for comparison
josiah_sequence = validation_isolates.loc[validation_isolates["strain"] == "Josiah_NC_004296_reverse_complement_2018-08-13"].at[0,"sequence"]
# Create a wildtype df to fill in missing Josiah/wildtype measurements as 0
josiah_df = pd.DataFrame(
zip(list(range(1,492)), josiah_sequence, josiah_sequence, [0]*491, [True]*491),
columns=["site", "wildtype", "mutant", "effect", "Josiah reference"],
)
# Merge functional effects and josiah reference dataframes
AA_level_df = (
pd.concat([functional_scores, josiah_df], ignore_index=True)
.sort_values(by="site")
.reset_index(drop=True)
.fillna(False)
)
# Get natural sequence counts of each mutant and calculate mutation frequencies
# compared to the Josiah reference
AA_level_df["natural_counts"] = (
AA_level_df.apply(lambda x: get_natural_sequence_counts(x["site"], x["mutant"], natural_seqs_df), axis=1)
)
# Mark mutations seen at N times seen in natural sequences
AA_level_df["natural_count_bins"] = "all DMS measured amino-acid mutations"
# At least once
at_least_once = (
AA_level_df.loc[AA_level_df["natural_counts"] >= 1].copy()
)
at_least_once["natural_count_bins"] = "mutations observed >= 1 in natural GPCs"
AA_level_df = (
pd.concat([
AA_level_df,
at_least_once,
], ignore_index = True)
)
# At least X times
at_least_x = (
AA_level_df.loc[AA_level_df["natural_counts"] >= 10].copy()
)
at_least_x["natural_count_bins"] = "mutations observed >= 10 in natural GPCs"
AA_level_df = (
pd.concat([
AA_level_df,
at_least_x,
], ignore_index = True)
)
# Remove any mutations that are the Josiah reference because it skews the median to 0
AA_level_df = AA_level_df.loc[(AA_level_df["wildtype"] != AA_level_df["mutant"])].copy()
Plot functional score distributions stratified by times observed in natural sequences
In [6]:
# Plot score distrbutions
distribution_plot = alt.Chart(
AA_level_df, title="Effect on cell entry for mutations observed in natural GPCs"
).mark_boxplot(
color="black",
outliers={"strokeWidth" : 1, "opacity" : 0.25},
median={"thickness" : 1.5, "color" : "white"},
).encode(
y=alt.Y(
"natural_count_bins:N",
title=None,
sort=["mutations observed >= 10 in natural GPCs", "mutations observed >= 1 in natural GPCs", "all DMS measured amino-acid mutations"],
axis=alt.Axis(
domainWidth=1,
domainColor="black",
tickColor="black",
labelLimit=400,
),
),
x=alt.X(
"effect:Q",
title="effect on cell entry",
axis=alt.Axis(
values=[-4,-3,-2,-1,0,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.5, 1.25])
),
)
# Combine plot
combined_plot = (
(distribution_plot)
.configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
labelFontWeight="normal",
titleFontWeight="normal",
).properties(
width=400,
height=100,
).configure_title(
fontSize=24,
).configure_view(
stroke=None
)
)
combined_plot
Out[6]:
Recreate same plot above but formatted for paper
In [7]:
# Plot score distrbutions
distribution_plot = alt.Chart(
AA_level_df, title="Effect on cell entry for mutations observed in natural GPCs"
).mark_boxplot(
color="black",
outliers={"strokeWidth" : 1, "opacity" : 0.25, "size" : 10},
median={"thickness" : 1.5, "color" : "white"},
).encode(
y=alt.Y(
"natural_count_bins:N",
title=None,
sort=["mutations observed >= 10 in natural GPCs", "mutations observed >= 1 in natural GPCs", "all DMS measured amino-acid mutations"],
axis=alt.Axis(
domainWidth=1,
domainColor="black",
tickColor="black",
labelLimit=400,
),
),
x=alt.X(
"effect:Q",
title="effect on cell entry",
axis=alt.Axis(
values=[-4,-3,-2,-1,0,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.5, 1.25])
),
)
# Combine plot
combined_plot = (
(distribution_plot)
.configure_axis(
grid=False,
labelFontSize=8,
titleFontSize=8,
labelFontWeight="normal",
titleFontWeight="normal",
).properties(
width=100,
height=75,
).configure_title(
fontSize=12,
).configure_view(
stroke=None
)
)
combined_plot
Out[7]: