# this cell is tagged as parameters for `papermill` parameterization
# configs
altair_config = None
nipah_config = None
# inputs
binding_data = None
HENV103_filter = None
HENV117_filter = None
HENV26_filter = None
HENV32_filter = None
m102_filter = None
nAH1_filter = None
func_scores_low_effect_E3_file = None
concat_df_file = None
# outputs
escape_bubble_plot = None
bubble_1_mut_plot = None
mab_line_escape_plot = None
aggregate_mab_and_binding = None
aggregate_mab_and_niv_polymorphism = None
mab_plot_top = None
mab_plot_all = None
combined_evol_sites_escape = None

# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
HENV103_filter = "results/filtered_data/escape/HENV103_escape_filtered.csv"
HENV117_filter = "results/filtered_data/escape/HENV117_escape_filtered.csv"
HENV26_filter = "results/filtered_data/escape/HENV26_escape_filtered.csv"
HENV32_filter = "results/filtered_data/escape/HENV32_escape_filtered.csv"
m102_filter = "results/filtered_data/escape/m102_escape_filtered.csv"
nAH1_filter = "results/filtered_data/escape/nAH1_escape_filtered.csv"
func_scores_low_effect_E3_file = (
    "results/filtered_data/escape/e3_low_mab_effect_filter.csv"
)
concat_df_file = "results/filtered_data/escape/mab_filter_concat.csv"
binding_data = "results/filtered_data/binding/e2_binding_filtered.csv"
escape_bubble_plot = "results/images/escape_bubble_plot.html"
overlap_escape_plot = "results/images/overlap_escape_plot.html"
mab_line_escape_plot = "results/images/mab_line_escape_plot.html"
mab_plot_top = "results/images/mab_plot_top.html"
mab_plot_all = "results/images/mab_plot_all.html"
aggregate_mab_and_binding = "results/images/aggregate_mab_and_binding.html"
combined_evol_sites_escape = "results/images/combined_evol_sites_escape.html"
aggregate_mab_and_niv_polymorphism = (
    "results/images/aggregate_mab_and_niv_polymorphism.html"
)

import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
import yaml
import matplotlib

matplotlib.rcParams["svg.fonttype"] = "none"

from Bio import PDB
import dmslogo
from dmslogo.colorschemes import CBPALETTE
from dmslogo.colorschemes import ValueToColorMap
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if (
    os.getcwd()
    == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
    pass
    print("Already in correct directory")
else:
    os.chdir(
        "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
    )
    print("Setup in correct directory")

# import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append("data/custom_analyses_data/")
import theme

alt.themes.register("main_theme", theme.main_theme)
alt.themes.enable("main_theme")

Setup in correct directory

ThemeRegistry.enable('main_theme')

if nipah_config is None:
    print("this notebook is being run manually")
    # configs
    nipah_config = "nipah_config.yaml"
    # input
    HENV103_filter = "results/filtered_data/escape/HENV103_escape_filtered.csv"
    HENV117_filter = "results/filtered_data/escape/HENV117_escape_filtered.csv"
    HENV26_filter = "results/filtered_data/escape/HENV26_escape_filtered.csv"
    HENV32_filter = "results/filtered_data/escape/HENV32_escape_filtered.csv"
    m102_filter = "results/filtered_data/escape/m102_escape_filtered.csv"
    nAH1_filter = "results/filtered_data/escape/nAH1_escape_filtered.csv"

    func_scores_low_effect_E3_file = (
        "results/filtered_data/escape/e3_low_mab_effect_filter.csv"
    )
    concat_df_file = "results/filtered_data/escape/mab_filter_concat.csv"
    binding_data = "results/filtered_data/binding/e2_binding_filtered.csv"
else:
    print("this notebook was run with papermill")

this notebook was run with papermill

with open(nipah_config) as f:
    config = yaml.safe_load(f)

func_scores_E3_low_effect = pd.read_csv(
    func_scores_low_effect_E3_file
)  # read low effect cell entry data for making heatmaps
combined_df = pd.read_csv(
    concat_df_file
)  # read filtered concatanated antibody escape data
bind_df = pd.read_csv(binding_data)  # read filtered ephrin-b2 binding data

def find_top_sites(df, ab):
    tmp_df = df[df["ab"] == ab]  # filter
    top_escape_df = tmp_df.query("top_escape == True")  # find max escape sites
    sum_site_list = top_escape_df["site"].unique().tolist()
    print(sum_site_list)
    return sum_site_list


# call the function and make site list for each mAb
m102_sites = find_top_sites(combined_df, "m102.4")
HENV26_sites = find_top_sites(combined_df, "HENV-26")
HENV117_sites = find_top_sites(combined_df, "HENV-117")
HENV103_sites = find_top_sites(combined_df, "HENV-103")
HENV32_sites = find_top_sites(combined_df, "HENV-32")
nAH1_sites = find_top_sites(combined_df, "nAH1.3")

# make combined lists of antibodies depending on epitope
binding_face_list = list(set(m102_sites + HENV26_sites + HENV117_sites))
dimer_face_list = list(set(HENV103_sites + HENV32_sites))
everything_list = list(
    set(
        m102_sites
        + HENV26_sites
        + HENV117_sites
        + HENV103_sites
        + HENV32_sites
        + nAH1_sites
    )
)

[239, 507, 559, 577, 582, 586, 589]
[491, 494, 501, 530]
[171, 258, 555, 580, 582, 583, 586, 587]
[205, 258, 259, 260, 264]
[199, 200, 205]
[184, 185, 186, 187, 188, 189, 190, 447, 448, 449, 450, 451, 452, 468, 513, 515, 516, 517, 518, 519, 520, 597]

def find_sites_for_ab(df, site_list, ab_list, color_by):
    tmp_df = combined_df[combined_df["site"].isin(site_list)]
    tmp_df = tmp_df[tmp_df["ab"].isin(ab_list)]
    # Add wildtype residue info for logo plot
    tmp_df["wildtype_site"] = tmp_df["wildtype"].astype(str) + tmp_df["site"].astype(
        str
    )
    # Find colors based on effect
    tmp_df["clip"] = np.clip(tmp_df[color_by], None, 0)
    min_prop = tmp_df[color_by].min()
    max_prop = tmp_df["clip"].max()
    map1 = ValueToColorMap(minvalue=(min_prop - 0.5), maxvalue=max_prop, cmap="YlGn")
    tmp_df["color"] = tmp_df["clip"].map(map1.val_to_color)
    return tmp_df


# get information for binding face antibodies
binding_face_df = find_sites_for_ab(
    combined_df, binding_face_list, ["m102.4", "HENV-26", "HENV-117"], "effect"
)

# Want to put them in a specific order so do the following:
custom_order = ["m102.4", "HENV-117", "HENV-26"]
binding_face_df["ab"] = pd.Categorical(
    binding_face_df["ab"], categories=custom_order, ordered=True
)
binding_face_df = binding_face_df.sort_values(by="ab")

# get dimer face information
dimer_face_df = find_sites_for_ab(
    combined_df, dimer_face_list, ["HENV-103", "HENV-32"], "effect"
)
# get information about nAH1.3
nAH1_df = find_sites_for_ab(combined_df, nAH1_sites, ["nAH1.3"], "effect")

# make a df for sites that have opposite effects on mab escape
custom_order = ["m102.4", "HENV-117", "HENV-26", "HENV-103", "HENV-32"]
fusion_trig_df = find_sites_for_ab(
    combined_df,
    [165, 166, 167, 168, 169, 170, 171, 172, 204, 208, 209, 270, 589],
    ["m102.4", "HENV-117", "HENV-26", "HENV-32", "HENV-103"],
    "effect",
)
fusion_trig_df["ab"] = pd.Categorical(
    fusion_trig_df["ab"], categories=custom_order, ordered=True
)
fusion_trig_df = fusion_trig_df.sort_values(by="ab")

def generate_facet_logo_plot(df, output_file_name):
    """Generate logo plot and save as a file."""
    draw_logo_kwargs = {
        "letter_col": "mutant",
        "color_col": "color",
        "xtick_col": "wildtype_site",
        "letter_height_col": "escape_mean",
        "xlabel": "",
        "clip_negative_heights": True,
    }
    fig, ax = dmslogo.facet_plot(
        data=df,
        x_col="site",
        gridrow_col="ab",
        share_ylim_across_rows=False,
        show_col=None,
        draw_logo_kwargs=draw_logo_kwargs,
    )
    fig
    fig.savefig(output_file_name, bbox_inches="tight", format="svg")


# Call functions and save logo plots
generate_facet_logo_plot(
    binding_face_df, "results/images/logo_plots/binding_face_escape.svg"
)
generate_facet_logo_plot(
    binding_face_df.query("min_mutations == 1"),
    "results/images/logo_plots/binding_face_escape_1_mutant_away.svg",
)

generate_facet_logo_plot(
    dimer_face_df, "results/images/logo_plots/dimer_face_escape.svg"
)
generate_facet_logo_plot(
    dimer_face_df.query("min_mutations ==1"),
    "results/images/logo_plots/dimer_face_escape_1_mutant_away.svg",
)

generate_facet_logo_plot(nAH1_df, "results/images/logo_plots/nAH1_escape_solo.svg")
generate_facet_logo_plot(
    nAH1_df.query("min_mutations ==1"),
    "results/images/logo_plots/nAH1_escape_solo_1_mutant_away.svg",
)

def generate_facet_logo_plot_binding(df, output_file_name):
    """Generate logo plot and save as a file."""
    draw_logo_kwargs = {
        "letter_col": "mutant",
        "color_col": "color",
        "xtick_col": "wildtype_site",
        "letter_height_col": "escape_mean",
        "xlabel": "",
        "clip_negative_heights": False,
    }
    fig, ax = dmslogo.facet_plot(
        data=df,
        x_col="site",
        gridrow_col="ab",
        share_ylim_across_rows=False,
        show_col=None,
        draw_logo_kwargs=draw_logo_kwargs,
    )
    fig
    fig.savefig(output_file_name, bbox_inches="tight", format="svg")


generate_facet_logo_plot_binding(
    fusion_trig_df, "results/images/logo_plots/fusion_trig.svg"
)

def find_combined_sites(list1, list2):
    # function to find unique elements in two lists and return one sorted list
    combined_list = list(list1) + list(list2)
    unique_elements = set(combined_list)
    sorted_list = sorted(unique_elements)
    return sorted_list


def find_top_mean_escape_sites(df):
    # function to find the top sites of escape for each antibody
    for ab in ["m102.4", "HENV-26", "HENV-117", "HENV-103", "HENV-32", "nAH1.3"]:
        # select specific antibody
        tmp_df = df[df["ab"] == ab]
        # find max escape sites
        tmp_df = tmp_df.groupby(["site"])["escape_mean"].mean().reset_index()
        max_df = tmp_df.sort_values(by="escape_mean", ascending=False).head(10)
        max_sites = list(max_df["site"].unique())
        print(f"For {ab}, top averaged escape sites are :{max_sites}")

        # find top summed escape sites
        agg_df = tmp_df.groupby(["site"])["escape_mean"].sum().reset_index()
        tmp_df_sort = agg_df.sort_values(by="escape_mean", ascending=False).head(10)
        summed_sites = list(tmp_df_sort["site"].unique())
        print(f"For {ab}, top summed escape sites are: {summed_sites}")

        # call function to return unique sorted list
        combined_list = find_combined_sites(max_sites, summed_sites)
        print(f"The combined list for {ab} is: {combined_list}\n")


# Call function
find_top_mean_escape_sites(combined_df)

For m102.4, top averaged escape sites are :[243, 239, 532, 582, 507, 531, 589, 526, 586, 494]
For m102.4, top summed escape sites are: [243, 239, 532, 582, 507, 531, 589, 526, 586, 494]
The combined list for m102.4 is: [239, 243, 494, 507, 526, 531, 532, 582, 586, 589]

For HENV-26, top averaged escape sites are :[491, 494, 530, 501, 529, 531, 258, 257, 171, 208]
For HENV-26, top summed escape sites are: [491, 494, 530, 501, 529, 531, 258, 257, 171, 208]
The combined list for HENV-26 is: [171, 208, 257, 258, 491, 494, 501, 529, 530, 531]

For HENV-117, top averaged escape sites are :[258, 580, 586, 582, 171, 257, 204, 208, 587, 589]
For HENV-117, top summed escape sites are: [258, 580, 586, 582, 171, 257, 204, 208, 587, 589]
The combined list for HENV-117 is: [171, 204, 208, 257, 258, 580, 582, 586, 587, 589]

For HENV-103, top averaged escape sites are :[258, 260, 205, 259, 264, 277, 156, 274, 151, 155]
For HENV-103, top summed escape sites are: [258, 260, 205, 259, 264, 277, 156, 274, 151, 155]
The combined list for HENV-103 is: [151, 155, 156, 205, 258, 259, 260, 264, 274, 277]

For HENV-32, top averaged escape sites are :[199, 200, 585, 205, 593, 534, 556, 552, 531, 535]
For HENV-32, top summed escape sites are: [199, 200, 585, 205, 593, 534, 556, 552, 531, 535]
The combined list for HENV-32 is: [199, 200, 205, 531, 534, 535, 552, 556, 585, 593]

For nAH1.3, top averaged escape sites are :[468, 185, 517, 450, 447, 189, 516, 451, 184, 188]
For nAH1.3, top summed escape sites are: [468, 185, 517, 450, 447, 189, 516, 451, 184, 188]
The combined list for nAH1.3 is: [184, 185, 188, 189, 447, 450, 451, 468, 516, 517]

def identify_escape_sites(df, ab):
    subset = df[(df["ab"] == ab)]
    unique_sites = list(subset["site"].unique())
    return unique_sites


abs = ["HENV-26", "HENV-103", "HENV-32", "HENV-117", "m102.4", "nAH1.3"]
sites_dict = {}  # Create an empty dictionary to store the results

for ab in abs:
    sites_dict[ab] = identify_escape_sites(combined_df.query("top_escape == True"), ab)

order_ab = ["m102.4", "HENV-26", "HENV-117", "HENV-103", "HENV-32", "nAH1.3"]


def generate_bubble_chart(df):
    df = df.round(2)
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, fields=["site"], value=1
    )
    chart = (
        alt.Chart(
            df,
            title=alt.Title(
                "Top Antibody Escape Mutations",
                subtitle="Hover over points to see escape at same site",
            ),
        )
        .mark_point(stroke="black", filled=True)
        .encode(
            x=alt.X(
                "ab:O",
                sort=order_ab,
                title="Antibody",
                axis=alt.Axis(labelAngle=-90, grid=False),
            ),
            y=alt.Y(
                "effect:Q",
                title="Cell Entry of Top Escape",
                axis=alt.Axis(
                    grid=True, tickCount=4, values=[0.5, 0, -0.5, -1, -1.5, -2]
                ),
            ),
            size=alt.Size(
                "escape_mean", legend=alt.Legend(title="Mean Escape By Mutation")
            ),
            xOffset="random:Q",
            tooltip=[
                "site",
                "wildtype",
                "mutant",
                "ab",
                "effect",
                "escape_mean",
                "escape_std",
            ],
            color=alt.Color("ab").legend(None),
            opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
            strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
        )
        .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
        .properties(width=400, height=400)
        .add_params(variant_selector)
    )

    return chart


escape_bubble = generate_bubble_chart(
    combined_df.query("top_escape == True & escape_mean > 0.25")
)
escape_bubble.display()
if mab_line_escape_plot is not None:
    escape_bubble.save(escape_bubble_plot)

def find_overlapping_escape(df):
    slider = alt.binding_range(
        min=config["min_func_effect_for_ab"], max=0, step=0.25, name="effect"
    )
    selector = alt.param(name="SelectorName", value=-4, bind=slider)

    radio = alt.binding_radio(options=[1, 2, 3], name="Min Mutations:")
    mutation_selector = alt.param(name="MutationSelector", value=1, bind=radio)

    df_filtered = df
    # Group by 'site' and 'mutant', count the unique 'ab' values for each group
    grouped = df_filtered.groupby(["site", "mutant"])["ab"].nunique().reset_index()

    # Filter groups where the count of unique 'ab' values is at least 2
    result = grouped[grouped["ab"] >= 2]

    # Merge the result with the original dataframe to get the full rows
    df_result = pd.merge(df, result[["site", "mutant"]], on=["site", "mutant"])
    df_result["mutation_number"] = (
        df_result["mutation"].str.extract("(\d+)").astype(int)
    )
    base = (
        (
            alt.Chart(df_result, title=alt.Title("Shared antibody escape mutations"))
            .mark_rect()
            .encode(
                x=alt.X(
                    "mutation:O",
                    title="Site",
                    sort=alt.EncodingSortField(field="mutation_number"),
                    axis=alt.Axis(labelAngle=-90, grid=False),
                ),
                y=alt.Y(
                    "ab:O", title="Mutant", sort=order_ab, axis=alt.Axis(grid=False)
                ),  # Apply custom sort order here
                color="escape_mean",
                tooltip=[
                    "site",
                    "wildtype",
                    "mutant",
                    "escape_mean",
                    "min_mutations",
                ],
            )
        )
        .properties(height=200, width=400)
        .add_params(selector, mutation_selector)
        .transform_filter(
            (alt.datum.effect >= selector)
            & (alt.datum.min_mutations == mutation_selector)
        )
    )
    return base


# call function
overlap_escape = find_overlapping_escape(combined_df.query("top_escape == True"))
overlap_escape.display()
if mab_line_escape_plot is not None:
    overlap_escape.save(overlap_escape_plot)

def plot_line_escape(df):
    # Setup interactivity
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=0
    )
    # Group by 'site' and 'mutant', count the unique 'ab' values for each group
    summed = df.groupby(["site", "ab"])["escape_mean"].mean().reset_index().round(3)

    # Need to add dummy row because site 500 has been completely masked out due to low entry scores and not showing up on x-axis
    new_row = pd.DataFrame({"site": [500], "ab": ["HENV-103"], "escape_mean": [0]})
    summed = pd.concat([summed, new_row], ignore_index=True)

    empty_chart = []  # empty list for storing charts
    ab_list = ["m102.4", "HENV-117", "HENV-26", "HENV-103", "HENV-32", "nAH1.3"]
    for idx, ab in enumerate(ab_list):
        tmp_df = summed[summed["ab"] == ab]
        if ab in ["m102.4", "HENV-26", "HENV-117"]:
            color = "#1f4e79"
        if ab in ["HENV-103", "HENV-32"]:
            color = "#ff7f0e"
        if ab in ["nAH1.3"]:
            color = "#2ca02c"

        # Conditionally set the x-axis labels and title for the last plot
        is_last_plot = idx == len(ab_list) - 1
        x_axis = alt.Axis(
            values=[100, 200, 300, 400, 500, 600],
            tickCount=6,
            labelAngle=-90,
            grid=True,
            labelExpr="datum.value % 100 === 0 ? datum.value : ''",
            title="Site" if is_last_plot else None,
            labels=is_last_plot,
        )  # Only show labels for the last plot
        base = (
            alt.Chart(tmp_df)
            .mark_line(size=1, color=color)
            .encode(
                x=alt.X("site:O", axis=x_axis),
                y=alt.Y(
                    "escape_mean", title=f"{ab}", axis=alt.Axis(grid=False, tickCount=3)
                ),
            )
            .properties(width=800, height=100)
        )
        point = (
            base.mark_point(color="black", size=10, filled=True)
            .encode(
                x=alt.X("site:O", axis=x_axis),
                y=alt.Y(
                    "escape_mean",
                    title=f"{ab}",
                    axis=alt.Axis(
                        grid=False,
                    ),
                ),
                size=alt.condition(variant_selector, alt.value(100), alt.value(15)),
                color=alt.condition(
                    variant_selector, alt.value("black"), alt.value(color)
                ),
                tooltip=["site", "escape_mean"],
            )
            .add_params(variant_selector)
        )
        chart = base + point
        empty_chart.append(chart)

    combined_chart = (
        alt.vconcat(*empty_chart, spacing=1)
        .resolve_scale(y="independent", x="shared", color="independent")
        .properties(
            title=alt.Title(
                "Mean Antibody Escape by Site", subtitle="Colored by epitope"
            )
        )
    )

    return combined_chart


tmp_line = plot_line_escape(combined_df.query("escape_mean > 0"))
tmp_line.display()
if mab_line_escape_plot is not None:
    tmp_line.save(mab_line_escape_plot)

def calculate_min_distances(pdb_path, source_chain_id, target_chain_ids, name):
    # Initialize the PDB parser and load the structure
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("structure_id", pdb_path)

    source_chain = structure[0][source_chain_id]
    target_chains = [structure[0][chain_id] for chain_id in target_chain_ids]

    data = []

    for residueA in source_chain:
        if residueA.resname in ["HOH", "WAT", "IPA", "NAG"]:
            continue

        min_distance = float("inf")
        closest_residueB = None
        closest_chain_id = None
        residues_within_4 = 0

        for target_chain in target_chains:
            for residueB in target_chain:
                if residueB.resname in ["HOH", "WAT", "IPA"]:
                    continue

                # Check for residues within 4 angstroms
                is_within_4 = False
                for atomA in residueA:
                    for atomB in residueB:
                        distance = atomA - atomB
                        if distance < min_distance:
                            min_distance = distance
                            closest_residueB = residueB
                            closest_chain_id = target_chain.get_id()
                        if distance < 4:
                            is_within_4 = True
                if is_within_4:
                    residues_within_4 += 1

        data.append(
            {
                "wildtype": residueA.resname,
                "site": residueA.id[1],
                "chain": closest_chain_id,
                "residue": closest_residueB.id[1],
                "residue_name": closest_residueB.resname,
                "distance": min_distance,
                "residues_within_4": residues_within_4,
                "ab": name,
            }
        )

    # Convert data to pandas DataFrame
    df = pd.DataFrame(data)
    return df


def check_file(input_path, source_chain, target_chain, name, output_path):

    file_path = output_path

    if not os.path.exists(file_path):
        print(f"File {name} does not exist, running calculation")
        output_df = calculate_min_distances(
            input_path, source_chain, target_chain, name
        )
        print(f"done calculating for {file_path}")
        output_df.to_csv(output_path, index=False)
        return output_df
    else:
        print("File already exists,loading from disk")
        output_df = pd.read_csv(output_path)
        return output_df


pdb_path_26 = "data/custom_analyses_data/crystal_structures/6vy5.pdb"
source_chain_26 = "A"
target_chains_26 = ["H", "L"]
output_path_26 = "results/distances/df_HENV26_atomic_distances.csv"

pdb_path_32 = "data/custom_analyses_data/crystal_structures/6vy4.pdb"
source_chain_32 = "A"
target_chains_32 = ["H", "L"]
output_path_32 = "results/distances/df_HENV32_atomic_distances.csv"

pdb_path_nah = "data/custom_analyses_data/crystal_structures/7txz.pdb"
source_chain_nah = "A"
target_chains_nah = ["F", "E"]
output_path_nah = "results/distances/df_nAH_atomic_distances.csv"

pdb_path_m102 = "data/custom_analyses_data/crystal_structures/6cmg.pdb"
source_chain_m102 = "A"
target_chains_m102 = ["B", "C"]
output_path_m102 = "results/distances/df_m102_atomic_distances.csv"

# Call distance functions
df_HENV26 = check_file(
    pdb_path_26, source_chain_26, target_chains_26, "HENV-26", output_path_26
)
df_HENV32 = check_file(
    pdb_path_32, source_chain_32, target_chains_32, "HENV-32", output_path_32
)
df_nah = check_file(
    pdb_path_nah, source_chain_nah, target_chains_nah, "nAH1.3", output_path_nah
)
df_nah["chain"].replace(
    {"E": "H", "F": "L"}, inplace=True
)  # Fix naming so consistent heavy and light chain naming
df_m102 = check_file(
    pdb_path_m102, source_chain_m102, target_chains_m102, "m102.4", output_path_m102
)
df_m102["chain"].replace(
    {"C": "H", "B": "L"}, inplace=True
)  # Fix naming so consistent heavy and light chain naming

File already exists,loading from disk
File already exists,loading from disk
File already exists,loading from disk
File already exists,loading from disk

def find_close_mab_sites(df, name):
    unique_sites = df.query("distance <= 4")["site"].unique()
    mab_site_list = list(unique_sites)
    print(f"Close sites for mAb {name} are: {mab_site_list}")
    return mab_site_list


### First find RBP sites that are close to mAb residues
nah_close = find_close_mab_sites(df_nah, "nAH1.3")
HENV26_close = find_close_mab_sites(df_HENV26, "HENV-26")
HENV32_close = find_close_mab_sites(df_HENV32, "HENV-32")
m102_close = find_close_mab_sites(df_m102, "m102.4")

### Now combined the close residues AND the top escape sites identified previously
nah_combined_sites = sites_dict["nAH1.3"] + nah_close
HENV26_combined_sites = sites_dict["HENV-26"] + HENV26_close
HENV32_combined_sites = sites_dict["HENV-32"] + HENV32_close
m102_combined_sites = sites_dict["m102.4"] + m102_close

Close sites for mAb nAH1.3 are: [172, 183, 184, 185, 186, 187, 188, 190, 191, 358, 449, 450, 451, 472, 515, 516, 517, 518, 570]
Close sites for mAb HENV-26 are: [389, 401, 403, 404, 458, 488, 489, 490, 491, 492, 494, 497, 501, 504, 505, 506, 528, 529, 530, 531, 532, 533, 555, 556, 557, 581, 586]
Close sites for mAb HENV-32 are: [196, 199, 200, 201, 202, 203, 205, 206, 207, 210, 254, 256, 258, 260, 262, 263, 264, 266, 553]
Close sites for mAb m102.4 are: [239, 240, 241, 242, 305, 458, 488, 489, 490, 504, 505, 506, 507, 530, 532, 533, 555, 557, 559, 579, 580, 581, 588]

def make_distance(df):
    subset_df = df[["site", "distance"]].copy()
    subset_df["mutant"] = "distance"
    subset_df["wildtype"] = ""
    subset_df["effect"] = "escape_mean"
    subset_df.rename(columns={"distance": "value"}, inplace=True)
    return subset_df


distance_nah_df = make_distance(df_nah)
distance_26_df = make_distance(df_HENV26)
distance_32_df = make_distance(df_HENV32)
distance_m102_df = make_distance(df_m102)

display(distance_m102_df)

def make_empty_df_with_distance(ab, distance_file):
    sites = range(71, 603)
    amino_acids = [
        "A",
        "C",
        "D",
        "E",
        "F",
        "G",
        "H",
        "I",
        "K",
        "L",
        "M",
        "N",
        "P",
        "Q",
        "R",
        "S",
        "T",
        "V",
        "W",
        "Y",
    ]
    # Create the combination of each site with each amino acid
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(
        empty_df, combined_df.query(f'ab == "{ab}"'), on=["site", "mutant"], how="left"
    )
    df_melted = all_sites_df.melt(
        id_vars=["site", "mutant", "wildtype"],
        value_vars=["escape_mean"],
        var_name="effect",
        value_name="value",
    )

    df_filtered = func_scores_E3_low_effect.melt(
        id_vars=["site", "mutant", "wildtype"],
        value_vars=["effect"],
        var_name="effect",
        value_name="value",
    )

    df_test = pd.concat([df_melted, df_filtered, distance_file], ignore_index=True)
    df_test["ab"] = ab
    return df_test


# Call functions
empty_df_m102 = make_empty_df_with_distance("m102.4", distance_m102_df)
empty_df_HENV26 = make_empty_df_with_distance("HENV-26", distance_26_df)
empty_df_HENV32 = make_empty_df_with_distance("HENV-32", distance_32_df)
empty_df_nah = make_empty_df_with_distance("nAH1.3", distance_nah_df)


def make_empty_df(ab):
    sites = range(71, 603)
    amino_acids = [
        "A",
        "C",
        "D",
        "E",
        "F",
        "G",
        "H",
        "I",
        "K",
        "L",
        "M",
        "N",
        "P",
        "Q",
        "R",
        "S",
        "T",
        "V",
        "W",
        "Y",
    ]
    # Create the combination of each site with each amino acid
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acids]
    # Create the DataFrame
    empty_df = pd.DataFrame(data)
    all_sites_df = pd.merge(
        empty_df, combined_df.query(f'ab == "{ab}"'), on=["site", "mutant"], how="left"
    )
    df_melted = all_sites_df.melt(
        id_vars=["site", "mutant", "wildtype"],
        value_vars=["escape_mean"],
        var_name="effect",
        value_name="value",
    )

    df_filtered = func_scores_E3_low_effect.melt(
        id_vars=["site", "mutant", "wildtype"],
        value_vars=["effect"],
        var_name="effect",
        value_name="value",
    )

    df_test = pd.concat([df_melted, df_filtered], ignore_index=True)
    df_test["ab"] = ab
    return df_test


# call functions
empty_df_HENV117 = make_empty_df("HENV-117")
empty_df_HENV103 = make_empty_df("HENV-103")

combined_ab = pd.concat(
    [
        empty_df_m102,
        empty_df_HENV26,
        empty_df_HENV32,
        empty_df_nah,
        empty_df_HENV117,
        empty_df_HENV103,
    ]
)
display(combined_ab)

def plot_distance_only(df, trigger):
    custom_order = [
        "distance",
        "R",
        "K",
        "H",
        "D",
        "E",
        "Q",
        "N",
        "S",
        "T",
        "Y",
        "W",
        "F",
        "A",
        "I",
        "L",
        "M",
        "V",
        "G",
        "P",
        "C",
    ]
    all_residues = range(71, 603)
    final_df = df
    final_df = final_df.sort_values(
        "site"
    )  # Sort the dataframe by 'site' to ensure that duplicates are detected correctly.
    sort_order = {
        mutant: i for i, mutant in enumerate(custom_order)
    }  # Create a dictionary that maps each mutant to its sort rank based on the custom order
    final_df["mutant_rank"] = final_df["mutant"].map(
        sort_order
    )  # Map the 'mutant' column to these ranks

    final_df = final_df.sort_values(
        "mutant_rank"
    )  # Now sort the dataframe by this rank
    final_df = final_df.drop(
        columns=["mutant_rank"]
    )  # Drop the 'mutant_rank' column as it is no longer needed after sorting
    sites = sorted(final_df["site"].unique(), key=lambda x: float(x))
    ab_list = ["m102.4", "HENV-117", "HENV-26", "HENV-103", "HENV-32", "nAH1.3"]
    empty_chart = []  # setup collection for charts
    for idx, ab in enumerate(ab_list):
        tmp_df = final_df[final_df["ab"] == ab]
        if ab == "m102.4":
            site_subset = m102_combined_sites
        if ab == "HENV-26":
            site_subset = HENV26_combined_sites
        if ab == "HENV-32":
            site_subset = HENV32_combined_sites
        if ab == "HENV-103":
            site_subset = sites_dict["HENV-103"]
        if ab == "HENV-117":
            site_subset = sites_dict["HENV-117"]
        if ab == "nAH1.3":
            site_subset = nah_combined_sites

        # select which sites you will show
        if trigger == True:
            tmp_df = tmp_df[tmp_df["site"].isin(site_subset)]
            x_axis = alt.Axis(
                labelAngle=-90,
                title="Site",
            )
        else:
            tmp_df = tmp_df[tmp_df["site"].isin(all_residues)]

            # Conditionally set the x-axis labels and title for the last plot
            is_last_plot = idx == len(ab_list) - 1
            x_axis = alt.Axis(
                labelAngle=-90,
                labelExpr="datum.value % 10 === 0 ? datum.value : ''",
                title="Site" if is_last_plot else None,
                labels=True,
            )  # Only show labels for the last plot

        # Prepare the color scales separately for distance and effects
        # Filter out 'distance' values before creating the effect heatmap
        effect_df = tmp_df[
            (tmp_df["mutant"] != "distance") & (tmp_df["effect"] != "effect")
        ]
        max_color = effect_df["value"].max()
        min_color = effect_df["value"].min()

        # Adjust color scheme for abs with little sensitizing mutations
        if min_color > -1:
            min_color = min_color - 1

        # Prepare the color scale for effects
        color_scale_escape = alt.Scale(
            scheme="redblue", domainMid=0, domain=[min_color, max_color]
        )
        color_scale_entropy = alt.Scale(scheme="purples", domain=[0, 15], reverse=True)

        strokewidth_size = 0.5

        unique_wildtypes_df = tmp_df.drop_duplicates(subset=["site", "wildtype"])

        # The chart for the heatmap
        base = (
            alt.Chart(tmp_df, title=f"{ab}")
            .encode(
                x=alt.X("site:O", title="Site", sort=sites, axis=x_axis),
                y=alt.Y(
                    "mutant",
                    title="Amino Acid",
                    sort=alt.EncodingSortField(field="sort_order", order="ascending"),
                    axis=alt.Axis(grid=False),
                ),  # Apply custom sort order here
                tooltip=["site", "wildtype", "mutant", "value"],
            )
            .properties(width=alt.Step(12), height=alt.Step(12))
        )
        # Heatmap for distance
        chart_empty = (
            base.mark_rect(color="#e6e7e8")
            .encode()
            .transform_filter(alt.datum.effect == "escape_mean")
        )
        # Heatmap for effect
        chart_effect = (
            base.mark_rect(stroke="black", strokeWidth=strokewidth_size)
            .encode(
                color=alt.condition(
                    'datum.mutant != "distance"',
                    alt.Color(
                        "value:Q",
                        scale=color_scale_escape,
                        legend=alt.Legend(title=f"{ab} Escape"),
                    ),
                    alt.value("transparent"),
                ),
            )
            .transform_filter(alt.datum.effect == "escape_mean")
        )

        # Heatmap for distance
        if ab in ["m102.4", "HENV-26", "HENV-32", "nAH1.3"]:
            chart_distance = (
                base.mark_rect()
                .encode(
                    color=alt.condition(
                        'datum.mutant == "distance"',
                        alt.Color(
                            "value:Q",
                            scale=color_scale_entropy,
                            legend=alt.Legend(title="Distance to mAb"),
                        ),
                        alt.value("transparent"),
                    )
                )
                .transform_filter(alt.datum.effect == "escape_mean")
            )
        else:
            chart_distance = (
                base.mark_rect(color="transparent")
                .encode()
                .transform_filter(alt.datum.effect == "escape_mean")
            )
        # Heatmap for distance
        chart_filtered = (
            base.mark_rect(
                color="#939598", stroke="black", strokeWidth=strokewidth_size
            )
            .encode()
            .transform_filter(alt.datum.effect == "effect")
        )

        # The layer for the wildtype boxes
        wildtype_layer_box = (
            alt.Chart(unique_wildtypes_df)
            .mark_rect(color="white", stroke="black", strokeWidth=strokewidth_size)
            .encode(
                x=alt.X("site:O", sort=sites),
                y=alt.Y(
                    "wildtype",
                    sort=alt.EncodingSortField(field="sort_order", order="ascending"),
                ),
                opacity=alt.value(1),
            )
            .transform_filter(
                (alt.datum.wildtype != "")
                & (alt.datum.wildtype != None)
                & (alt.datum.value != None)
            )
        )
        # The layer for the wildtype amino acids
        wildtype_layer = (
            alt.Chart(unique_wildtypes_df)
            .mark_text(color="black", text="X", size=8)
            .encode(
                x=alt.X("site:O", sort=sites),
                y=alt.Y(
                    "wildtype",
                    sort=alt.EncodingSortField(field="sort_order", order="ascending"),
                ),
                opacity=alt.value(1),
            )
            .transform_filter(
                (alt.datum.wildtype != "")
                & (alt.datum.wildtype != None)
                & (alt.datum.value != None)
            )
        )

        # Combine the heatmap layer with the wildtype layer
        chart = alt.layer(
            chart_empty,
            chart_effect,
            chart_distance,
            chart_filtered,
            wildtype_layer_box,
            wildtype_layer,
        ).resolve_scale(color="independent")
        empty_chart.append(chart)
    combined_chart = (
        alt.vconcat(*empty_chart, spacing=1)
        .resolve_scale(y="shared", x="independent", color="independent")
        .configure_title(
            anchor="start",  # Aligns the title to the left ('middle' for center, 'end' for right)
            offset=10,  # Adjusts the distance of the title from the chart
            orient="top",  # Positions the title at the top; use 'bottom' to position at the bottom
        )
    )
    return combined_chart


mab_plot = plot_distance_only(combined_ab, True)
mab_plot.display()
if mab_line_escape_plot is not None:
    mab_plot.save(mab_plot_top)

mab_all = plot_distance_only(combined_ab, False)
mab_all.display()
if mab_line_escape_plot is not None:
    mab_all.save(mab_plot_all)

# bind_df is ephrin-b2 filtered binding data
binding_df = bind_df.groupby("site")["binding_mean"].mean().reset_index()


def make_empty_binding():
    sites = range(71, 603)
    data = [{"site": site} for site in sites]
    empty_df = pd.DataFrame(data)
    empty_df = pd.merge(empty_df, binding_df, on="site", how="left")
    empty_df = empty_df.rename(columns={"binding_mean": "value"})
    empty_df["effect"] = "escape_mean"
    empty_df["ab"] = "Ephrin-B2 binding"
    return empty_df


binding_empty = make_empty_binding()

escape_df = combined_df.groupby(["ab", "site"])["escape_mean"].mean().reset_index()


def make_empty_df(ab):
    sites = range(71, 603)
    data = [{"site": site} for site in sites]

    # Create the DataFrame
    empty_df = pd.DataFrame(data)

    all_sites_df = pd.merge(
        empty_df, escape_df.query(f'ab == "{ab}"'), on=["site"], how="left"
    )

    df_melted = all_sites_df.melt(
        id_vars=["site"],
        value_vars=["escape_mean"],
        var_name="effect",
        value_name="value",
    )

    df_test = pd.concat([df_melted], ignore_index=True)
    df_test["ab"] = ab
    return df_test


ab_list = ["m102.4", "HENV-26", "HENV-117", "HENV-103", "HENV-32", "nAH1.3"]

empty = []  # empty container to hold dfs
for ab in ab_list:
    tmp_df = make_empty_df(ab)
    empty.append(tmp_df)
all_empties_df = pd.concat(empty, ignore_index=True)
all_empties_df = pd.concat([all_empties_df, binding_empty])
display(all_empties_df)

def make_heatmap_with_binding(df):
    # Define the custom sort order directly in the encoding
    sort_order = [
        "NiV Polymorphism",
        "Ephrin-B2 binding",
        "m102.4",
        "HENV-26",
        "HENV-117",
        "HENV-103",
        "HENV-32",
        "nAH1.3",
    ]
    full_ranges = [
        list(range(start, end))
        for start, end in [(71, 181), (181, 291), (291, 401), (401, 511), (511, 603)]
    ]

    # container to hold the charts
    charts = []
    # define color scheme
    color_scale_effect = alt.Scale(scheme="redblue", domainMid=0)
    color_scale_binding = alt.Scale(scheme="redblue", domainMid=0)

    for idx, subset in enumerate(full_ranges):
        subset_df = df[df["site"].isin(subset)]  # for the wrapping of sites
        is_last_plot = idx == len(full_ranges) - 1
        x_axis = alt.Axis(
            labelAngle=-90,
            labelExpr="datum.value % 10 === 0 ? datum.value : ''",
            title="Site" if is_last_plot else None,
            labels=True,
        )  # Only show labels for the last plot

        effect_legend = alt.Legend(title="Antibody Escape") if is_last_plot else None
        binding_legend = (
            alt.Legend(title="Henipavirus Entropy") if is_last_plot else None
        )

        base = (
            alt.Chart(subset_df)
            .encode(
                x=alt.X("site:O", title="Site", axis=x_axis),
                y=alt.Y(
                    "ab", title=None, sort=sort_order, axis=alt.Axis(grid=False)
                ),  # Correctly apply custom sort order
                tooltip=["site", "value"],
            )
            .properties(width=alt.Step(10), height=alt.Step(11))
        )

        # Define the chart for empty cells
        chart_empty = base.mark_rect(color="#e6e7e8").transform_filter(
            alt.datum.effect == "escape_mean"
        )

        # Define the chart for cells with effect
        chart_effect = (
            base.mark_rect(stroke="black", strokeWidth=0.25)
            .encode(
                color=alt.condition(
                    'datum.effect == "escape_mean"',
                    alt.Color(
                        "value:Q", scale=color_scale_effect, legend=effect_legend
                    ),  # Define a color scale
                    alt.value("transparent"),
                )
            )
            .transform_filter(alt.datum.effect == "escape_mean")
        )

        chart_binding = (
            base.mark_rect(strokeWidth=1.1)
            .encode(
                stroke=alt.value("value"),
                color=alt.condition(
                    'datum.effect == "escape_mean"',
                    alt.Color(
                        "value:Q", scale=color_scale_binding, legend=binding_legend
                    ),
                    alt.value("transparent"),
                ),
            )
            .transform_filter(alt.datum.ab == "Ephrin-B2 binding")
        )

        chart_poly = (
            base.mark_rect(color="black")
            .encode()
            .transform_filter(alt.datum.ab == "NiV Polymorphism")
        )
        # Layer the charts
        chart = alt.layer(chart_empty, chart_effect, chart_binding, chart_poly)
        charts.append(chart)
    combined_chart = alt.vconcat(
        *charts, spacing=5, title="Heatmap of median mAb escape and Ephrin-B2 binding"
    )

    return combined_chart


# call function
chart = make_heatmap_with_binding(all_empties_df)
chart.display()
if mab_line_escape_plot is not None:
    chart.save(aggregate_mab_and_binding)

def make_contact():
    df = pd.DataFrame({"site": niv_poly, "contact": [0.0] * len(niv_poly)})
    df = df[["site", "contact"]]
    df["ab"] = "NiV Polymorphism"
    df["effect"] = "escape_mean"
    df.rename(columns={"contact": "value"}, inplace=True)
    return df


niv_poly = config["nipah_poly"]
contact_df = make_contact()

binding_df = bind_df.groupby("site")["binding_mean"].max().reset_index()


def make_empty_binding():
    sites = range(71, 603)
    data = [{"site": site} for site in sites]
    empty_df = pd.DataFrame(data)
    empty_df = pd.merge(empty_df, binding_df, on="site", how="left")
    empty_df = empty_df.rename(columns={"binding_mean": "value"})
    empty_df["effect"] = "escape_mean"
    empty_df["ab"] = "Ephrin-B2 binding"
    return empty_df


binding_empty = make_empty_binding()

escape_df = combined_df.groupby(["ab", "site"])["escape_mean"].max().reset_index()


def make_empty_df(ab):
    sites = range(71, 603)
    data = [{"site": site} for site in sites]

    # Create the DataFrame
    empty_df = pd.DataFrame(data)

    all_sites_df = pd.merge(
        empty_df, escape_df.query(f'ab == "{ab}"'), on=["site"], how="left"
    )

    df_melted = all_sites_df.melt(
        id_vars=["site"],
        value_vars=["escape_mean"],
        var_name="effect",
        value_name="value",
    )

    df_test = pd.concat([df_melted], ignore_index=True)
    df_test["ab"] = ab
    return df_test


ab_list = ["m102.4", "HENV-26", "HENV-117", "HENV-103", "HENV-32", "nAH1.3"]

empty = []
for ab in ab_list:
    tmp_df = make_empty_df(ab)
    empty.append(tmp_df)
all_empties_df = pd.concat(empty, ignore_index=True)
all_empties_df = pd.concat([all_empties_df, contact_df])
display(all_empties_df)

def make_heatmap_with_polymorphisms(df):
    # Define the custom sort order directly in the encoding
    sort_order = [
        "NiV Polymorphism",
        "m102.4",
        "HENV-26",
        "HENV-117",
        "HENV-103",
        "HENV-32",
        "nAH1.3",
    ]
    full_ranges = [
        list(range(start, end))
        for start, end in [(71, 181), (181, 291), (291, 401), (401, 511), (511, 603)]
    ]

    # container to hold the charts
    charts = []
    color_scale_effect = alt.Scale(scheme="redblue", domainMid=0, domain=[0, 2])
    color_scale_binding = alt.Scale(scheme="redblue", domainMid=0, domain=[-5, 2])

    # Flags for showing the legend only the first time
    effect_legend_added = True
    binding_legend_added = True
    for idx, subset in enumerate(full_ranges):
        subset_df = df[df["site"].isin(subset)]  # for the wrapping of sites
        is_last_plot = idx == len(full_ranges) - 1
        x_axis = alt.Axis(
            labelAngle=-90,
            labelExpr="datum.value % 10 === 0 ? datum.value : ''",
            title="Site" if is_last_plot else None,
            labels=True,
        )  # Only show labels for the last plot

        base = (
            alt.Chart(subset_df)
            .encode(
                x=alt.X("site:O", title="Site", axis=x_axis),
                y=alt.Y(
                    "ab", title=None, sort=sort_order, axis=alt.Axis(grid=False)
                ),  # Correctly apply custom sort order
                tooltip=["site", alt.Tooltip("value", format=".2f")],
            )
            .properties(width=alt.Step(10), height=alt.Step(11))
        )

        # Define the chart for empty cells
        chart_empty = base.mark_rect(color="#e6e7e8").transform_filter(
            alt.datum.effect == "escape_mean"
        )
        if not effect_legend_added:
            # Define the chart for cells with effect
            chart_effect = (
                base.mark_rect(stroke="black", strokeWidth=0.25)
                .encode(
                    color=alt.condition(
                        'datum.effect == "escape_mean"',
                        alt.Color(
                            "value:Q", scale=color_scale_effect
                        ),  # Define a color scale
                        alt.value("transparent"),
                    )
                )
                .transform_filter(alt.datum.effect == "escape_mean")
            )
            effect_legend_added = True
        else:
            # Define the chart for cells with effect
            chart_effect = (
                base.mark_rect(stroke="black", strokeWidth=0.25)
                .encode(
                    color=alt.condition(
                        'datum.effect == "escape_mean"',
                        alt.Color(
                            "value:Q", scale=color_scale_effect, legend=None
                        ),  # Define a color scale
                        alt.value("transparent"),
                    )
                )
                .transform_filter(alt.datum.effect == "escape_mean")
            )
        if not binding_legend_added:
            chart_binding = (
                base.mark_rect(strokeWidth=1.1)
                .encode(
                    stroke=alt.value("value"),
                    color=alt.condition(
                        'datum.effect == "escape_mean"',
                        alt.Color("value:Q", scale=color_scale_binding),
                        alt.value("transparent"),
                    ),
                )
                .transform_filter(alt.datum.ab == "Ephrin-B2 binding")
            )
            binding_legend_added = True
        else:
            chart_binding = (
                base.mark_rect(strokeWidth=1.1)
                .encode(
                    stroke=alt.value("value"),
                    color=alt.condition(
                        'datum.effect == "escape_mean"',
                        alt.Color("value:Q", scale=color_scale_binding, legend=None),
                        alt.value("transparent"),
                    ),
                )
                .transform_filter(alt.datum.ab == "Ephrin-B2 binding")
            )

        chart_poly = (
            base.mark_rect(color="black")
            .encode()
            .transform_filter(alt.datum.ab == "NiV Polymorphism")
        )
        # Layer the charts
        chart = alt.layer(chart_empty, chart_effect, chart_poly).resolve_scale(
            color="independent"
        )
        charts.append(chart)
    combined_chart = alt.vconcat(
        *charts, spacing=5, title="Heatmap of max mAb escape and Nipah Polymorphisms"
    ).resolve_scale(y="shared", x="independent", color="shared")

    return combined_chart


# Call function
chart = make_heatmap_with_polymorphisms(all_empties_df)
chart.display()
if mab_line_escape_plot is not None:
    chart.save(aggregate_mab_and_niv_polymorphism)

def make_polymorphism_escape(df, sites_list, title_name):
    df["is_poly"] = df["site"].isin(sites_list)

    aggregated_df = (
        df.groupby(["site", "ab"])
        .agg({"escape_mean": "mean", "wildtype": "first", "is_poly": "first"})
        .reset_index()
    )

    aggregated_df["is_poly"] = aggregated_df["is_poly"].map(
        {True: "Polymorphic", False: "Conserved"}
    )

    empty_chart = []

    antibodies = ["m102.4", "HENV-117", "HENV-26", "HENV-103", "HENV-32", "nAH1.3"]

    base = (
        alt.Chart(aggregated_df.query("escape_mean > 0"), title=title_name)
        .mark_point(size=75, opacity=1, filled=True, strokeWidth=0.75, stroke="black")
        .encode(
            y=alt.X("is_poly", title=None, axis=alt.Axis(grid=False)),
            yOffset="random:Q",
            x=alt.Y(
                "escape_mean",
                title="Mean antibody escape",
                axis=alt.Axis(grid=False, tickCount=3),
            ),
            tooltip=["site"],
            color=alt.Color("is_poly", title="Sequence Conservation"),
            row=alt.Row("ab", sort=antibodies, title=None),
        )
        .transform_calculate(random="sqrt(-2*log(random()))*cos(2*PI*random())")
        .properties(width=300, height=30)
    )

    return base


nipah_poly_chart = make_polymorphism_escape(
    combined_df, config["nipah_poly"], "Antibody Escape for Polymorphic Nipah Sites"
)
nipah_poly_chart.display()

hendra_poly_chart = make_polymorphism_escape(
    combined_df,
    config["hendra_poly"],
    "Antibody Escape for Sites Different in Hendra virus",
)
hendra_poly_chart.display()

combined_poly_plots = (
    alt.hconcat(nipah_poly_chart, hendra_poly_chart, padding=50)
    .configure_header(
        labelFontSize=14,
        labelAngle=0,
        labelAlign="left",
        labelFont="Helvetica Light",
        labelFontStyle="bold",
        labelPadding=3,
    )
    .configure_axisY(labels=False)
)
combined_poly_plots.display()
if mab_line_escape_plot is not None:
    combined_poly_plots.save(combined_evol_sites_escape)

	site	value	mutant	wildtype	effect
0	176	35.044434	distance		escape_mean
1	177	31.866014	distance		escape_mean
2	178	27.842815	distance		escape_mean
3	179	28.777035	distance		escape_mean
4	180	28.332012	distance		escape_mean
...	...	...	...	...	...
423	599	30.802711	distance		escape_mean
424	600	28.920950	distance		escape_mean
425	601	27.248772	distance		escape_mean
426	602	29.020868	distance		escape_mean
427	603	27.945621	distance		escape_mean

analyze_escape_data.ipynb¶

Script to read in filtered antibody escape data, analyze stats, make logo plots, and make altair figures for paper.¶

Import python packages, set working directory, import altair theme¶

For running notebook interactively¶

Load config files¶

Load data¶

Find the top escape sites and make lists for different antibody subsets¶

make dataframes for making logo plots¶

Make the faceted logo plots¶

Generate a logo plot specifically for opposing muts between receptor binding face and dimerization face¶

Find the top sites of escape¶

Make a dictionary of top sites of escape for plotting later¶

Plot bubble chart showing mAb escape for individual mutants by functional score for both E2 or E3¶

Make heatmap of overlapping escape sites between two antibodies¶

Make plot of mean escape by site¶

Now calculate atomic distances between escape sites and closest amino acid in heavy and light chains¶

Find RBP sites that are within 4 angstroms of an antibody residue¶

Make antibody escape heatmaps¶

Make dataframes compatible with heatmaps¶

Prepare dataframes for heatmaps¶

Make heatmaps¶

Make full antibody escape heatmaps¶

Now make heatmaps of antibody escape versus Ephrin Binding¶

Now show heatmap with nipah polymorphisms¶

Make a plot comparing escape at Hendra and Nipah substitutions¶

	site	mutant	wildtype	effect	value	ab
0	71	A	NaN	escape_mean	NaN	m102.4
1	71	C	Q	escape_mean	0.290000	m102.4
2	71	D	Q	escape_mean	-0.021530	m102.4
3	71	E	Q	escape_mean	0.017950	m102.4
4	71	F	Q	escape_mean	0.004024	m102.4
...	...	...	...	...	...	...
13305	597	T	I	effect	-3.261000	HENV-103
13306	598	C	P	effect	-2.075000	HENV-103
13307	598	I	P	effect	-2.012000	HENV-103
13308	598	W	P	effect	-3.040000	HENV-103
13309	601	G	C	effect	-2.047000	HENV-103

	site	effect	value	ab
0	71	escape_mean	0.064496	m102.4
1	72	escape_mean	0.006840	m102.4
2	73	escape_mean	0.058987	m102.4
3	74	escape_mean	-0.031857	m102.4
4	75	escape_mean	0.004129	m102.4
...	...	...	...	...
527	598	escape_mean	0.160900	Ephrin-B2 binding
528	599	escape_mean	-0.176051	Ephrin-B2 binding
529	600	escape_mean	-0.081067	Ephrin-B2 binding
530	601	escape_mean	0.338929	Ephrin-B2 binding
531	602	escape_mean	-0.020847	Ephrin-B2 binding

	site	effect	value	ab
0	71	escape_mean	0.42320	m102.4
1	72	escape_mean	0.09214	m102.4
2	73	escape_mean	0.33640	m102.4
3	74	escape_mean	0.06910	m102.4
4	75	escape_mean	0.11410	m102.4
...	...	...	...	...
27	478	escape_mean	0.00000	NiV Polymorphism
28	481	escape_mean	0.00000	NiV Polymorphism
29	498	escape_mean	0.00000	NiV Polymorphism
30	502	escape_mean	0.00000	NiV Polymorphism
31	545	escape_mean	0.00000	NiV Polymorphism