# Imports
import os
import polyclonal
import pandas as pd
import altair as alt

# Colour palette used for the region plots.
# The order is deliberate: colours are handed out to regions by position.
tol_muted_adjusted = [
    "#AA4499", "#88CCEE", "#EE7733", "#44AA99",
    "#1f78b4", "#CC6677", "#117733", "#999933",
    "#DDCC77", "#CC3311", "#882255",
    "#000000", "#DDDDDD",
]

# Disable Altair's maximum-row check so the charts below can be built from
# the full table. The return value is bound to `_` only to discard it.
_ = alt.data_transformers.disable_max_rows()

# this cell is tagged as `parameters` for papermill parameterization
# Placeholder defaults; each is reassigned in the `# Parameters` section below.
# Path to the CSV of functional effects read with `pd.read_csv`
func_scores = None

# Minimum `times_seen` and `n_selections` a mutation needs to be retained
min_times_seen = None
n_selections = None

# Output directory and the HTML file the large distribution plot is saved to
html_dir = None
html_output = None

# Parameters
# NOTE(review): presumably the values injected by papermill for this run;
# they override the placeholders above.
func_scores = "results/func_effects/averages/293T_entry_func_effects.csv"
min_times_seen = 2
n_selections = 8
html_dir = "results/func_scores_distributions/"
html_output = "results/func_scores_distributions/func_scores_distributions.html"

# # Uncomment for running interactive
# func_scores = "../results/func_effects/averages/293T_entry_func_effects.csv"

# min_times_seen = 2
# n_selections = 8

# html_dir = "../results/func_scores_distributions/"
# html_output = "../results/func_scores_distributions/func_scores_distributions.html"

# Load the functional scores, tag every row with a single dummy phenotype
# (used as the category column of the heatmaps) and give the effect column
# a descriptive name. No filtering happens here.
functional_scores = (
    pd.read_csv(func_scores)
    .assign(phenotype="functional_effect")
    .rename(columns={"effect": "effect on cell entry"})
)

# Plotting settings
# Amino-acid order passed as `alphabet` to the heatmaps; "*" is the stop codon.
alphabet = [
    "R", "K", "H", "D", "E", "Q", "N", "S", "T", "Y", "W",
    "F", "A", "I", "L", "M", "V", "G", "P", "C", "*",
]

# Extra per-mutation statistics shown in the heatmap tooltips.
addtl_tooltip_stats = [
    "n_selections",
    "times_seen",
]

# Initial slider values follow the notebook parameters rather than repeating
# their values as literals, so the heatmaps start with the same thresholds
# that are later used to filter the score distributions.
addtl_slider_stats = {
    "times_seen": min_times_seen,
    "n_selections": n_selections,
}

# Alpha-dystroglycan (alpha-DG) binding sites
DG_sites = [
    120, 121, 125,
    150,  # supported by 151 and 125 interactions
    151,
    256, 257, 258,
]

# Heatmap of functional effects restricted to the alpha-DG binding residues;
# the line plot and zoom bar are switched off.
dg_heatmap_options = {
    "data_df": functional_scores,
    "stat_col": "effect on cell entry",
    "category_col": "phenotype",
    "alphabet": alphabet,
    "addtl_tooltip_stats": addtl_tooltip_stats,
    "addtl_slider_stats": addtl_slider_stats,
    "init_floor_at_zero": False,
    "init_site_statistic": "mean",
    "show_zoombar": False,
    "show_lineplot": False,
    "sites": DG_sites,
    "plot_title": "\u03B1-DG binding residues",
    "heatmap_max_at_least": 2,
}
DG_chart = polyclonal.plot.lineplot_and_heatmap(**dg_heatmap_options)

DG_chart

# LAMP1 binding sites
LAMP1_sites = [
    92,  # histidine triad
    93,  # histidine triad
    172, 173,
    188,
    192, 195, 197, 198,
    200, 201, 202, 204, 206, 207,
    211, 216,
    230,  # histidine triad
]

# Heatmap of functional effects restricted to the LAMP1 binding residues;
# the line plot and zoom bar are switched off.
lamp1_heatmap_options = {
    "data_df": functional_scores,
    "stat_col": "effect on cell entry",
    "category_col": "phenotype",
    "alphabet": alphabet,
    "addtl_tooltip_stats": addtl_tooltip_stats,
    "addtl_slider_stats": addtl_slider_stats,
    "init_floor_at_zero": False,
    "init_site_statistic": "mean",
    "show_zoombar": False,
    "show_lineplot": False,
    "sites": LAMP1_sites,
    "plot_title": "LAMP1 binding residues",
    "heatmap_max_at_least": 2,
}
LAMP1_chart = polyclonal.plot.lineplot_and_heatmap(**lamp1_heatmap_options)

LAMP1_chart

# Show the entire heatmap split into consecutive site ranges, one chart per
# range. Each chart is displayed and also collected in `segments` (the list
# was previously created but never filled).
segments = []
for segment in [(1, 140), (141, 280), (281, 420), (421, 491)]:
    curr_segment = polyclonal.plot.lineplot_and_heatmap(
        data_df=functional_scores,
        stat_col="effect on cell entry",
        category_col="phenotype",
        alphabet=alphabet,
        addtl_tooltip_stats=addtl_tooltip_stats,
        addtl_slider_stats=addtl_slider_stats,
        init_floor_at_zero=False,
        init_site_statistic="mean",
        show_zoombar=False,
        show_lineplot=False,
        heatmap_max_at_least=2,
        # `segment` is the (first, last) site pair for this chart
        sites_to_show={"include_range": segment},
    )
    segments.append(curr_segment)

    curr_segment.display()

# Keep mutations that reach the minimum times seen and number of selections,
# and remove stop codons.
passes_filters = (
    (functional_scores["times_seen"] >= min_times_seen)
    & (functional_scores["n_selections"] >= n_selections)
    & (functional_scores["mutant"] != "*")
)
# Take an explicit copy: columns are added to this table further down, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
functional_scores = functional_scores.loc[passes_filters].copy()

# # **
# # Uncomment if want to compute site mean effects
# # Groupby site
# functional_scores = (
#     functional_scores
#     .groupby("site")
#     .aggregate({
#         "wildtype" : "first",
#         "effect on cell entry" : "mean",
#     })
#     .reset_index()
# )
# # **

# Label regions by site number: SSP (site <= 58), GP1 (59 - 259), GP2 (rest).
# Built with vectorised masks instead of a row-wise `apply`; start from the
# catch-all label and overwrite from the widest bound to the narrowest.
site_numbers = functional_scores["site"]
region_labels = (
    pd.Series("GP2", index=functional_scores.index)
    .mask(site_numbers <= 259, "GP1")
    .mask(site_numbers <= 58, "SSP")
)
# `assign` returns a new frame, so nothing is written into a filtered slice.
functional_scores = functional_scores.assign(region=region_labels)


# Append annotated subsets of sites as extra "regions" so they show up as
# their own rows in the distribution plots.
#
# Every subset is selected from the region-labelled base table, NOT from the
# growing concatenation. Selecting from the concatenated table would pick up
# copies appended by an earlier annotation: sites 120 and 121 are in both
# `DG_sites` and `glycans`, so they would be counted twice under
# "N-glycosylation sites".
base_scores = functional_scores

# GP2 transmembrane domain (428 - 447)
TM = list(range(428, 448))

# GP2 cytoplasmic tail (448 - 491)
CT = range(448, 492)

# Glycosylation sites N - X - S/T (three consecutive sites per motif)
glycans = [
    79, 80, 81,
    89, 90, 91,
    99, 100, 101,
    109, 110, 111,
    119, 120, 121,
    167, 168, 169,
    224, 225, 226,
    365, 366, 367,
    373, 374, 375,
    390, 391, 392,
    395, 396, 397,
]

# (region label, sites) pairs; the order here is the order in which the
# subsets are appended and therefore the order of the rows in the plots.
annotated_regions = [
    ("TM", TM),
    ("CT", CT),
    ("\u03B1-DG binding sites", DG_sites),
    ("LAMP1 binding sites", LAMP1_sites),
    ("N-glycosylation sites", glycans),
]

functional_scores = pd.concat(
    [base_scores]
    + [
        base_scores.loc[base_scores["site"].isin(sites)].assign(region=label)
        for label, sites in annotated_regions
    ],
    ignore_index=True,
)

# Plot score distributions for each region as a jittered strip plot
region_order = functional_scores["region"].unique().tolist()
black_axis_style = {
    "domainWidth": 1,
    "domainColor": "black",
    "tickColor": "black",
}

distribution_plot = (
    alt.Chart(
        functional_scores,
        title="Effect on cell entry for different GPC regions",
    )
    .mark_circle(opacity=0.15, size=75)
    .encode(
        y=alt.Y(
            "region:N",
            title="GPC region",
            sort=None,  # keep regions in the order they appear in the data
            axis=alt.Axis(**black_axis_style),
        ),
        x=alt.X(
            "effect on cell entry:Q",
            title="effect on cell entry",
            axis=alt.Axis(values=list(range(-5, 2)), **black_axis_style),
        ),
        yOffset="jitter:Q",
        color=alt.Color(
            "region:N",
            scale=alt.Scale(domain=region_order, range=tol_muted_adjusted),
        ).legend(None),
        tooltip=[
            "site",
            "wildtype",
            alt.Tooltip(
                "effect on cell entry",
                format=".2f",
                title="effect on cell entry",
            ),
        ],
    )
    .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Mark the median effect of every region with a black tick
median_field = "median(effect on cell entry):Q"

median_plot = (
    alt.Chart(functional_scores)
    .mark_tick(size=45, thickness=3, color="#000000")
    .encode(
        y=alt.Y(
            "region:N",
            title="GPC region",
            sort=None,  # keep regions in the order they appear in the data
            axis=alt.Axis(domainWidth=1),
        ),
        x=alt.X(
            median_field,
            axis=alt.Axis(values=list(range(-5, 2)), domainWidth=1),
        ),
        tooltip=[
            "region:N",
            alt.Tooltip(
                median_field,
                format=".2f",
                title="median effect on cell entry",
            ),
        ],
    )
)

# Combine strip plot and median ticks into one layered chart, then style it
layered_plot = distribution_plot + median_plot

axis_style = {
    "grid": False,
    "labelFontSize": 16,
    "titleFontSize": 16,
    "labelFontWeight": "normal",
    "titleFontWeight": "normal",
}

combined_plot = (
    layered_plot
    .configure_axis(**axis_style)
    .properties(width=400, height=400)
    .configure_title(fontSize=24)
    .configure_view(stroke=None)  # no border around the plotting area
)

# Make the output directory if it doesn't exist. `makedirs` also creates
# missing parent directories, and `exist_ok=True` removes the check-then-create
# race of `os.path.exists` followed by `os.mkdir`.
os.makedirs(html_dir, exist_ok=True)

print(f"Saving to {html_output}")
combined_plot.save(html_output)

combined_plot

Saving to results/func_scores_distributions/func_scores_distributions.html

# Plot score distributions for each region (small version: smaller points,
# no title)
region_order = functional_scores["region"].unique().tolist()

distribution_plot = (
    alt.Chart(functional_scores)
    .mark_circle(opacity=0.15, size=5)
    .encode(
        y=alt.Y(
            "region:N",
            title="GPC region",
            sort=None,  # keep regions in the order they appear in the data
            axis=alt.Axis(domainWidth=1),
        ),
        x=alt.X(
            "effect on cell entry:Q",
            title="effect on cell entry",
            axis=alt.Axis(values=list(range(-5, 2)), domainWidth=1),
        ),
        yOffset="jitter:Q",
        color=alt.Color(
            "region:N",
            scale=alt.Scale(domain=region_order, range=tol_muted_adjusted),
        ).legend(None),
        tooltip=[
            "site",
            "wildtype",
            alt.Tooltip(
                "effect on cell entry",
                format=".2f",
                title="effect on cell entry",
            ),
        ],
    )
    .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Mark the median effect of every region with a thin black tick
median_field = "median(effect on cell entry):Q"

median_plot = (
    alt.Chart(functional_scores)
    .mark_tick(size=10, thickness=1, color="#000000")
    .encode(
        y=alt.Y(
            "region:N",
            title="GPC region",
            sort=None,  # keep regions in the order they appear in the data
            axis=alt.Axis(domainWidth=1),
        ),
        x=alt.X(
            median_field,
            axis=alt.Axis(values=list(range(-5, 2)), domainWidth=1),
        ),
        tooltip=[
            "region:N",
            alt.Tooltip(
                median_field,
                format=".2f",
                title="median effect on cell entry",
            ),
        ],
    )
)

# Combine strip plot and median ticks into one small layered chart
layered_plot = distribution_plot + median_plot

axis_style = {
    "grid": False,
    "labelFontSize": 8,
    "titleFontSize": 8,
    "labelFontWeight": "normal",
    "titleFontWeight": "normal",
}

combined_plot = (
    layered_plot
    .configure_axis(**axis_style)
    .properties(width=150, height=150)
    .configure_view(stroke=None)  # no border around the plotting area
)

combined_plot

Visualize receptor binding regions and distribution of scores for different GPC regions

Heatmap of alpha-dystroglycan binding residues

Heatmap of LAMP1 binding residues

Show entire heatmap separated by ranges

Distributions of functional scores for different regions