Compare natural variation of GPC to DMS scores for specific regions of the protein¶
This notebook compares functional effects of mutations and the natural variation (i.e., effective amino acids) present at each site for different regions of GPC.
In [1]:
# Imports
import os
import warnings
import scipy as sp
import pandas as pd
import altair as alt
# Plotting colors
# re-arranged for plot
tol_muted_adjusted = [
"#AA4499",
"#88CCEE",
"#EE7733",
"#000000",
"#44AA99",
"#1f78b4",
"#CC6677",
"#117733",
"#999933",
"#DDCC77",
"#CC3311",
"#882255",
"#DDDDDD",
]
# Suppress warnings
warnings.simplefilter("ignore")
# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
natural_sequence_variation = None
filtered_func_293T = None
html_dir = None
html_output = None
In [3]:
# Parameters
natural_sequence_variation = (
"non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
)
filtered_func_293T = "results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
html_dir = "results/func_scores_distributions/"
html_output = (
"results/func_scores_distributions/func_scores_vs_natural_variation_by_region.html"
)
In [4]:
# # Uncomment for running interactive
# natural_sequence_variation = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
# filtered_func_293T = "../results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
# html_dir = "../results/func_scores_distributions/"
# html_output = "../results/func_scores_distributions/func_scores_vs_natural_variation_by_region.html"
In [5]:
# Load data as dataframe
natural_df = pd.read_csv(natural_sequence_variation)
functional_scores = (
pd.read_csv(filtered_func_293T)
.query("mutant != '*'")
.reset_index(drop=True)
)
# Label SSP, GP1, and GP2 regions of GPC
functional_scores["region"] = (
functional_scores.apply(
lambda x: "SSP" if x["site"] <= 58 else ("GP1" if x["site"] <= 259 else "GP2"), axis=1
)
)
# Average site functional effects
functional_scores = (
functional_scores.groupby(["site", "wildtype", "region"])
.aggregate({
"effect" : "mean"
})
.reset_index()
)
# Add natural variation (effective amino acids) per site
functional_scores = (
functional_scores.merge(
natural_df[["site", "n_effective", "entropy"]],
how="left",
on=["site"],
validate="many_to_one",
)
)
Plot correlations of site mean effects on cell entry and site effective amino acids for each region of GPC.
In [6]:
subplots = []
for index,region in enumerate(["SSP", "GP1", "GP2", "all"]):
# Subset data based on region
subsetted_data = None
if region == "all":
subsetted_data = functional_scores
else:
subsetted_data = functional_scores.query("region == @region")
# Calculate statistics
r, p = sp.stats.pearsonr(
subsetted_data.dropna()["effect"],
subsetted_data.dropna()["entropy"]
)
print(f"r correlation for {region}: {r:.2f}")
curr_subplot = alt.Chart(subsetted_data, title=region).mark_point(
filled=True,
color=tol_muted_adjusted[index],
size=50,
opacity=0.35,
).encode(
alt.X(
"entropy",
axis=alt.Axis(
title=["site entropy in", "natural sequences"],
values=[0,0.5,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-0.05, 1.3])
),
alt.Y(
"effect",
axis=alt.Axis(
title=["site mean", "effect on cell entry"],
values=[-4,-3,-2,-1,0,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.1,1.1])
),
tooltip=[
"site",
"wildtype",
"effect",
"n_effective",
"entropy",
],
).properties(
width=150,
height=150,
)
subplots.append(curr_subplot)
# Create final combined plot
natural_vs_func_effects = alt.hconcat(
subplots[0],
subplots[1],
subplots[2],
subplots[3],
spacing=5,
title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=24,
).configure_view(
stroke=None
)
# Make output dir if doesn't exist
if not os.path.exists(html_dir):
os.mkdir(html_dir)
print(f"Saving to {html_output}")
natural_vs_func_effects.save(html_output)
natural_vs_func_effects
r correlation for SSP: 0.43 r correlation for GP1: 0.32 r correlation for GP2: 0.34 r correlation for all: 0.30
Saving to results/func_scores_distributions/func_scores_vs_natural_variation_by_region.html
Out[6]:
Recreate same plot as above but formatted for paper
In [7]:
subplots = []
for index,region in enumerate(["SSP", "GP1", "GP2", "all"]):
# Subset data based on region
subsetted_data = None
if region == "all":
subsetted_data = functional_scores
else:
subsetted_data = functional_scores.query("region == @region")
# Calculate statistics
r, p = sp.stats.pearsonr(
subsetted_data.dropna()["effect"],
subsetted_data.dropna()["entropy"]
)
print(f"r correlation for {region}: {r:.2f}")
curr_subplot = alt.Chart(subsetted_data, title=region).mark_point(
filled=True,
color=tol_muted_adjusted[index],
size=20,
opacity=0.35,
).encode(
alt.X(
"entropy",
axis=alt.Axis(
title=["site entropy in", "natural sequences"],
values=[0,0.5,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-0.05, 1.3])
),
alt.Y(
"effect",
axis=alt.Axis(
title=["site mean", "effect on cell entry"],
values=[-4,-3,-2,-1,0,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.1,1.1])
),
tooltip=[
"site",
"wildtype",
"effect",
"n_effective",
"entropy",
],
).properties(
width=70,
height=70,
)
subplots.append(curr_subplot)
# Create final combined plot
natural_vs_func_effects = alt.hconcat(
subplots[0],
subplots[1],
subplots[2],
subplots[3],
spacing=5,
title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
grid=False,
labelFontSize=8,
titleFontSize=8,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=12,
).configure_view(
stroke=None
)
natural_vs_func_effects
r correlation for SSP: 0.43 r correlation for GP1: 0.32 r correlation for GP2: 0.34 r correlation for all: 0.30
Out[7]:
Plot distributions of site mean effects on cell entry and effective amino acids for each region of GPC
In [8]:
print(f'Average site entropy in SSP: {functional_scores.loc[functional_scores["region"] == "SSP"]["entropy"].mean()}')
print(f'Average site entropy in GP1: {functional_scores.loc[functional_scores["region"] == "GP1"]["entropy"].mean()}')
print(f'Average site entropy in GP2: {functional_scores.loc[functional_scores["region"] == "GP2"]["entropy"].mean()}')
# Calculate statistics
U1, p = sp.stats.mannwhitneyu(
functional_scores.loc[functional_scores["region"] == "SSP"]["entropy"],
functional_scores.loc[functional_scores["region"] == "GP1"]["entropy"],
method="exact",
)
print(f"SSP vs GP1 site entropy compared by Mann-Whitney U P value: {p}")
U1, p = sp.stats.mannwhitneyu(
functional_scores.loc[functional_scores["region"] == "SSP"]["entropy"],
functional_scores.loc[functional_scores["region"] == "GP2"]["entropy"],
method="exact",
)
print(f"SSP vs GP2 site entropy compared by Mann-Whitney U P value: {p}")
U1, p = sp.stats.mannwhitneyu(
functional_scores.loc[functional_scores["region"] == "GP1"]["entropy"],
functional_scores.loc[functional_scores["region"] == "GP2"]["entropy"],
method="exact",
)
print(f"GP1 vs GP2 site entropy compared by Mann-Whitney U P value: {p}")
# Plot score distrbutions for each region
DMS_scores = alt.Chart(
functional_scores,
).mark_boxplot(
outliers={"strokeWidth" : 1, "opacity" : 0.25},
median={"thickness" : 1.5, "color" : "white"}
).encode(
y=alt.Y(
"region:N",
title="GPC region",
sort=["SSP", "GP1", "GP2"],
axis=alt.Axis(
domainWidth=1,
domainColor="black",
tickColor="black",
),
),
x=alt.X(
"effect:Q",
title=["site mean", "effect on cell entry"],
axis=alt.Axis(
values=[-4,-3,-2,-1,0,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.1, 1.1])
),
# yOffset="jitter:Q",
color=alt.Color(
"region:N",
scale=alt.Scale(
domain=["SSP", "GP1", "GP2"],
range=tol_muted_adjusted
),
).legend(None),
tooltip=[
"site",
"wildtype",
alt.Tooltip(
"effect", format=".2f", title="effect on cell entry"
),
alt.Tooltip(
"n_effective", format=".2f", title="n effective amino acids"
),
alt.Tooltip(
"entropy", format=".2f", title="entropy"
),
],
).properties(
width=250,
height=150,
)
# Plot score distrbutions for each region
natural_variation = alt.Chart(
functional_scores,
).mark_boxplot(
outliers={"strokeWidth" : 1, "opacity" : 0.25},
median={"thickness" : 1.5, "color" : "white"}
).encode(
y=alt.Y(
"region:N",
title=None,
sort=["SSP", "GP1", "GP2"],
axis=alt.Axis(
domainWidth=1,
domainColor="black",
tickColor="black",
labels=False,
),
),
x=alt.X(
"entropy:Q",
title=["site entropy in", "natural sequences"],
axis=alt.Axis(
values=[0,0.5,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-0.05, 1.3])
),
color=alt.Color(
"region:N",
scale=alt.Scale(
domain=["SSP", "GP1", "GP2"],
range=tol_muted_adjusted
),
).legend(None),
).properties(
width=250,
height=150,
)
# Create combined plot
combined_plot = alt.hconcat(
DMS_scores,
natural_variation,
spacing=5,
title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=24,
).configure_view(
stroke=None
)
combined_plot
Average site entropy in SSP: 0.15134594402588158 Average site entropy in GP1: 0.09785388604926164 Average site entropy in GP2: 0.07163804710519318
SSP vs GP1 site entropy compared by Mann-Whitney U P value: 0.6892441952492875
SSP vs GP2 site entropy compared by Mann-Whitney U P value: 0.04234231123425291
GP1 vs GP2 site entropy compared by Mann-Whitney U P value: 0.007342094556621821
Out[8]:
Recreate same plot as above but formatted for paper
In [9]:
# Plot score distrbutions for each region
DMS_scores = alt.Chart(
functional_scores,
).mark_boxplot(
outliers={"strokeWidth" : 1, "opacity" : 0.25},
median={"thickness" : 1.5, "color" : "white"}
).encode(
y=alt.Y(
"region:N",
title="GPC region",
sort=["SSP", "GP1", "GP2"],
axis=alt.Axis(
domainWidth=1,
domainColor="black",
tickColor="black",
),
),
x=alt.X(
"effect:Q",
title=["site mean", "effect on cell entry"],
axis=alt.Axis(
values=[-4,-3,-2,-1,0,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-4.1, 1.1])
),
color=alt.Color(
"region:N",
scale=alt.Scale(
domain=["SSP", "GP1", "GP2"],
range=tol_muted_adjusted
),
).legend(None),
tooltip=[
"site",
"wildtype",
alt.Tooltip(
"effect", format=".2f", title="effect on cell entry"
),
alt.Tooltip(
"n_effective", format=".2f", title="n effective amino acids"
),
alt.Tooltip(
"entropy", format=".2f", title="entropy"
),
],
).properties(
width=90,
height=75,
)
# Plot score distrbutions for each region
natural_variation = alt.Chart(
functional_scores,
).mark_boxplot(
outliers={"strokeWidth" : 1, "opacity" : 0.25, "size" : 5},
median={"thickness" : 1.5, "color" : "white"}
).encode(
y=alt.Y(
"region:N",
title=None,
sort=["SSP", "GP1", "GP2"],
axis=alt.Axis(
domainWidth=1,
domainColor="black",
tickColor="black",
labels=False,
),
),
x=alt.X(
"entropy:Q",
title=["site entropy in", "natural sequences"],
axis=alt.Axis(
values=[0,0.5,1],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-0.05, 1.3])
),
color=alt.Color(
"region:N",
scale=alt.Scale(
domain=["SSP", "GP1", "GP2"],
range=tol_muted_adjusted
),
).legend(None),
).properties(
width=90,
height=75,
)
# Create combined plot
combined_plot = alt.hconcat(
DMS_scores,
natural_variation,
spacing=5,
title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
grid=False,
labelFontSize=8,
titleFontSize=8,
labelFontWeight="normal",
titleFontWeight="normal",
).configure_title(
fontSize=12,
).configure_view(
stroke=None
)
combined_plot
Out[9]: