# Imports
import os
import warnings
import scipy as sp
import pandas as pd
import altair as alt

# Plotting colors
# re-arranged for plot
tol_muted_adjusted = [
    "#AA4499",
    "#88CCEE",
    "#EE7733",
    "#000000",
    "#44AA99",
    "#1f78b4",
    "#CC6677",
    "#117733",
    "#999933",
    "#DDCC77",
    "#CC3311",
    "#882255",
    "#DDDDDD",
]

# Suppress warnings
warnings.simplefilter("ignore")

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# this cell is tagged as `parameters` for papermill parameterization
natural_sequence_variation = None
filtered_func_293T = None

html_dir = None
html_output = None

# Parameters
natural_sequence_variation = (
    "non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
)
filtered_func_293T = "results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
html_dir = "results/func_scores_distributions/"
html_output = (
    "results/func_scores_distributions/func_scores_vs_natural_variation_by_region.html"
)

# # Uncomment for running interactive
# natural_sequence_variation = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
# filtered_func_293T = "../results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"

# html_dir = "../results/func_scores_distributions/"
# html_output = "../results/func_scores_distributions/func_scores_vs_natural_variation_by_region.html"

# Load data as dataframe
natural_df = pd.read_csv(natural_sequence_variation)
functional_scores = (
    pd.read_csv(filtered_func_293T)
    .query("mutant != '*'")
    .reset_index(drop=True)
)

# Label SSP, GP1, and GP2 regions of GPC
functional_scores["region"] = (
    functional_scores.apply(
        lambda x: "SSP" if x["site"] <= 58 else ("GP1" if x["site"] <= 259 else "GP2"), axis=1
    )
)

# Average site functional effects
functional_scores = (
    functional_scores.groupby(["site", "wildtype", "region"])
    .aggregate({
        "effect" : "mean"
    })
    .reset_index()
)

# Add natural variation (effective amino acids) per site
functional_scores = (
    functional_scores.merge(
        natural_df[["site", "n_effective", "entropy"]],
        how="left",
        on=["site"],
        validate="many_to_one",
    )
)

subplots = []
for index,region in enumerate(["SSP", "GP1", "GP2", "all"]):

    # Subset data based on region
    subsetted_data = None
    if region == "all":
        subsetted_data = functional_scores
    else:
        subsetted_data = functional_scores.query("region == @region")

    # Calculate statistics
    r, p = sp.stats.pearsonr(
        subsetted_data.dropna()["effect"], 
        subsetted_data.dropna()["entropy"]
    )
    print(f"r correlation for {region}: {r:.2f}")
        
    curr_subplot = alt.Chart(subsetted_data, title=region).mark_point(
        filled=True, 
        color=tol_muted_adjusted[index], 
        size=50,
        opacity=0.35,
    ).encode(
        alt.X(
            "entropy",
            axis=alt.Axis(
                title=["site entropy in", "natural sequences"], 
                values=[0,0.5,1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-0.05, 1.3])
        ),
        alt.Y(
            "effect",
            axis=alt.Axis(
                title=["site mean", "effect on cell entry"], 
                values=[-4,-3,-2,-1,0,1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-4.1,1.1])
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "n_effective",
            "entropy",
        ], 
    ).properties(
        width=150,
        height=150,
    )
    
    subplots.append(curr_subplot)

# Create final combined plot
natural_vs_func_effects = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    subplots[3],
    spacing=5,
    title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

# Make output dir if doesn't exist
if not os.path.exists(html_dir):
    os.mkdir(html_dir)

print(f"Saving to {html_output}")
natural_vs_func_effects.save(html_output)

natural_vs_func_effects

r correlation for SSP: 0.43
r correlation for GP1: 0.32
r correlation for GP2: 0.34
r correlation for all: 0.30

Saving to results/func_scores_distributions/func_scores_vs_natural_variation_by_region.html

subplots = []
for index,region in enumerate(["SSP", "GP1", "GP2", "all"]):

    # Subset data based on region
    subsetted_data = None
    if region == "all":
        subsetted_data = functional_scores
    else:
        subsetted_data = functional_scores.query("region == @region")

    # Calculate statistics
    r, p = sp.stats.pearsonr(
        subsetted_data.dropna()["effect"], 
        subsetted_data.dropna()["entropy"]
    )
    print(f"r correlation for {region}: {r:.2f}")
        
    curr_subplot = alt.Chart(subsetted_data, title=region).mark_point(
        filled=True, 
        color=tol_muted_adjusted[index], 
        size=20,
        opacity=0.35,
    ).encode(
        alt.X(
            "entropy",
            axis=alt.Axis(
                title=["site entropy in", "natural sequences"], 
                values=[0,0.5,1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-0.05, 1.3])
        ),
        alt.Y(
            "effect",
            axis=alt.Axis(
                title=["site mean", "effect on cell entry"], 
                values=[-4,-3,-2,-1,0,1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-4.1,1.1])
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "n_effective",
            "entropy",
        ], 
    ).properties(
        width=70,
        height=70,
    )
    
    subplots.append(curr_subplot)

# Create final combined plot
natural_vs_func_effects = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    subplots[3],
    spacing=5,
    title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=12,
).configure_view(
    stroke=None
)

natural_vs_func_effects

r correlation for SSP: 0.43
r correlation for GP1: 0.32
r correlation for GP2: 0.34
r correlation for all: 0.30

print(f'Average site entropy in SSP: {functional_scores.loc[functional_scores["region"] == "SSP"]["entropy"].mean()}')
print(f'Average site entropy in GP1: {functional_scores.loc[functional_scores["region"] == "GP1"]["entropy"].mean()}')
print(f'Average site entropy in GP2: {functional_scores.loc[functional_scores["region"] == "GP2"]["entropy"].mean()}')

# Calculate statistics 
U1, p = sp.stats.mannwhitneyu(
    functional_scores.loc[functional_scores["region"] == "SSP"]["entropy"],
    functional_scores.loc[functional_scores["region"] == "GP1"]["entropy"],
    method="exact",
)
print(f"SSP vs GP1 site entropy compared by Mann-Whitney U P value: {p}")
U1, p = sp.stats.mannwhitneyu(
    functional_scores.loc[functional_scores["region"] == "SSP"]["entropy"],
    functional_scores.loc[functional_scores["region"] == "GP2"]["entropy"],
    method="exact",
)
print(f"SSP vs GP2 site entropy compared by Mann-Whitney U P value: {p}")
U1, p = sp.stats.mannwhitneyu(
    functional_scores.loc[functional_scores["region"] == "GP1"]["entropy"],
    functional_scores.loc[functional_scores["region"] == "GP2"]["entropy"],
    method="exact",
)
print(f"GP1 vs GP2 site entropy compared by Mann-Whitney U P value: {p}")

# Plot score distrbutions for each region
DMS_scores = alt.Chart(
        functional_scores,
    ).mark_boxplot(
        outliers={"strokeWidth" : 1, "opacity" : 0.25},
        median={"thickness" : 1.5, "color" : "white"}
    ).encode(
    y=alt.Y(
        "region:N",
        title="GPC region",
        sort=["SSP", "GP1", "GP2"],
        axis=alt.Axis(
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    x=alt.X(
        "effect:Q",
        title=["site mean", "effect on cell entry"],
        axis=alt.Axis(
            values=[-4,-3,-2,-1,0,1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.1, 1.1])
    ),
    # yOffset="jitter:Q",
    color=alt.Color(
        "region:N",
        scale=alt.Scale(
            domain=["SSP", "GP1", "GP2"], 
            range=tol_muted_adjusted
        ),
    ).legend(None),
    tooltip=[
        "site",
        "wildtype",
        alt.Tooltip(
            "effect", format=".2f", title="effect on cell entry"
        ),
        alt.Tooltip(
            "n_effective", format=".2f", title="n effective amino acids"
        ),
        alt.Tooltip(
            "entropy", format=".2f", title="entropy"
        ),
    ],
).properties(
    width=250, 
    height=150,
)

# Plot score distrbutions for each region
natural_variation = alt.Chart(
        functional_scores, 
    ).mark_boxplot(
        outliers={"strokeWidth" : 1, "opacity" : 0.25},
        median={"thickness" : 1.5, "color" : "white"}
    ).encode(
    y=alt.Y(
        "region:N",
        title=None,
        sort=["SSP", "GP1", "GP2"],
        axis=alt.Axis(
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            labels=False,
        ),
    ),
    x=alt.X(
        "entropy:Q",
        title=["site entropy in", "natural sequences"],
        axis=alt.Axis( 
            values=[0,0.5,1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-0.05, 1.3])
    ),
    color=alt.Color(
        "region:N",
        scale=alt.Scale(
            domain=["SSP", "GP1", "GP2"], 
            range=tol_muted_adjusted
        ),
    ).legend(None),
).properties(
    width=250, 
    height=150,
)


# Create combined plot
combined_plot = alt.hconcat(
    DMS_scores,
    natural_variation,
    spacing=5,
    title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

combined_plot

Average site entropy in SSP: 0.15134594402588158
Average site entropy in GP1: 0.09785388604926164
Average site entropy in GP2: 0.07163804710519318

SSP vs GP1 site entropy compared by Mann-Whitney U P value: 0.6892441952492875

SSP vs GP2 site entropy compared by Mann-Whitney U P value: 0.04234231123425291

GP1 vs GP2 site entropy compared by Mann-Whitney U P value: 0.007342094556621821

# Plot score distrbutions for each region
DMS_scores = alt.Chart(
        functional_scores,
    ).mark_boxplot(
        outliers={"strokeWidth" : 1, "opacity" : 0.25},
        median={"thickness" : 1.5, "color" : "white"}
    ).encode(
    y=alt.Y(
        "region:N",
        title="GPC region",
        sort=["SSP", "GP1", "GP2"],
        axis=alt.Axis(
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    x=alt.X(
        "effect:Q",
        title=["site mean", "effect on cell entry"],
        axis=alt.Axis(
            values=[-4,-3,-2,-1,0,1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-4.1, 1.1])
    ),
    color=alt.Color(
        "region:N",
        scale=alt.Scale(
            domain=["SSP", "GP1", "GP2"], 
            range=tol_muted_adjusted
        ),
    ).legend(None),
    tooltip=[
        "site",
        "wildtype",
        alt.Tooltip(
            "effect", format=".2f", title="effect on cell entry"
        ),
        alt.Tooltip(
            "n_effective", format=".2f", title="n effective amino acids"
        ),
        alt.Tooltip(
            "entropy", format=".2f", title="entropy"
        ),
    ],
).properties(
    width=90, 
    height=75,
)

# Plot score distrbutions for each region
natural_variation = alt.Chart(
        functional_scores, 
    ).mark_boxplot(
        outliers={"strokeWidth" : 1, "opacity" : 0.25, "size" : 5},
        median={"thickness" : 1.5, "color" : "white"}
    ).encode(
    y=alt.Y(
        "region:N",
        title=None,
        sort=["SSP", "GP1", "GP2"],
        axis=alt.Axis(
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            labels=False,
        ),
    ),
    x=alt.X(
        "entropy:Q",
        title=["site entropy in", "natural sequences"],
        axis=alt.Axis( 
            values=[0,0.5,1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-0.05, 1.3])
    ),
    color=alt.Color(
        "region:N",
        scale=alt.Scale(
            domain=["SSP", "GP1", "GP2"], 
            range=tol_muted_adjusted
        ),
    ).legend(None),
).properties(
    width=90, 
    height=75,
)


# Create combined plot
combined_plot = alt.hconcat(
    DMS_scores,
    natural_variation,
    spacing=5,
    title="Natural variation vs functional effects for different GPC regions",
).configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=12,
).configure_view(
    stroke=None
)

combined_plot

Compare natural variation of GPC to DMS scores for specific regions of the protein¶