# Imports
import os
import warnings
import dmslogo
import neutcurve
import numpy as np
import scipy as sp
import pandas as pd
import altair as alt
import seaborn as sns
import matplotlib.colors
from Bio import SeqIO, AlignIO 
from matplotlib import pyplot as plt
from matplotlib import ticker as mticker

# Plotting colors
tol_muted_adjusted = [
    "#000000",
    "#CC6677", 
    "#1f78b4", 
    "#DDCC77", 
    "#117733", 
    "#882255", 
    "#88CCEE",
    "#44AA99", 
    "#999933", 
    "#AA4499", 
    "#EE7733",
    "#CC3311",
    "#DDDDDD",
]

# Create color palette
def color_gradient_hex(start, end, n):
    """Color function from polyclonal"""
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            name="_", colors=[start, end], N=n
        )
    return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]

# Seaborn style settings
sns.set(rc={
    "figure.dpi":300, 
    "savefig.dpi":300,
    "svg.fonttype":"none",
})
sns.set_style("ticks")
sns.set_palette(tol_muted_adjusted)

# Suppress warnings
warnings.simplefilter("ignore")

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# this cell is tagged as `parameters` for papermill parameterization
filtered_escape_377H = None
filtered_escape_89F = None
filtered_escape_2510C = None
filtered_escape_121F = None
filtered_escape_256A = None
filtered_escape_372D = None

contacts_89F = None
contacts_377H = None
contacts_256A = None
contacts_2510C = None
contacts_121F = None
contacts_372D = None

func_scores = None
min_times_seen = None
n_selections = None

natural_sequence_variation = None
natural_GPC_sequence_alignment = None

fraction_infected_natural_isolates = None
out_dir = None
neuts_image_path = None
neuts_256A_image_path = None
corr_image_path = None
corr_256A_image_path = None

out_dir_escape = None
escape_top10_image_path = None
escape_all_image_path = None
escape_all_256A_image_path = None

out_dir_natural = None
natural_escape = None

html_nat_mut_freqs_vs_escape = None
html_nat_mut_freqs_vs_escape_all_abs = None
html_natural_vs_escape = None
html_natural_vs_escape_all_abs = None
html_arevirumab_comparisons = None

# Parameters
filtered_escape_377H = (
    "results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
)
filtered_escape_89F = (
    "results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
)
filtered_escape_2510C = (
    "results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
)
filtered_escape_121F = (
    "results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
)
filtered_escape_256A = (
    "results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
)
filtered_escape_372D = (
    "results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"
)
contacts_89F = "data/antibody_contacts/antibody_contacts_89F.csv"
contacts_377H = "data/antibody_contacts/antibody_contacts_377H.csv"
contacts_256A = "data/antibody_contacts/antibody_contacts_256A.csv"
contacts_2510C = "data/antibody_contacts/antibody_contacts_2510C.csv"
contacts_121F = "data/antibody_contacts/antibody_contacts_121F.csv"
contacts_372D = "data/antibody_contacts/antibody_contacts_372D.csv"
func_scores = "results/func_effects/averages/293T_entry_func_effects.csv"
natural_sequence_variation = (
    "non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
)
natural_GPC_sequence_alignment = "non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"
fraction_infected_natural_isolates = (
    "data/validation_frac_infected_natural_isolates.csv"
)
min_times_seen = 2
n_selections = 8
out_dir = "results/validation_plots/"
out_dir_escape = "results/antibody_escape_profiles/"
out_dir_natural = "results/natural_isolate_escape/"
neuts_image_path = (
    "results/validation_plots/validation_neut_curves_natural_isolates.svg"
)
neuts_256A_image_path = (
    "results/validation_plots/validation_neut_256A_curves_natural_isolates.svg"
)
corr_image_path = "results/validation_plots/natural_isolate_validation_correlation.svg"
corr_256A_image_path = (
    "results/validation_plots/natural_isolate_validation_256A_correlation.svg"
)
escape_top10_image_path = (
    "results/antibody_escape_profiles/natural_isolate_top10_escape_profiles.svg"
)
escape_all_image_path = (
    "results/antibody_escape_profiles/natural_isolate_all_escape_profiles.svg"
)
escape_all_256A_image_path = (
    "results/antibody_escape_profiles/natural_isolate_all_256A_escape_profiles.svg"
)
natural_escape = "results/natural_isolate_escape/natural_isolate_escape.svg"
html_nat_mut_freqs_vs_escape = (
    "results/natural_isolate_escape/nat_mut_freqs_vs_escape.html"
)
html_nat_mut_freqs_vs_escape_all_abs = (
    "results/natural_isolate_escape/nat_mut_freqs_vs_escape_all_abs.html"
)
html_natural_vs_escape = "results/natural_isolate_escape/natural_vs_escape.html"
html_natural_vs_escape_all_abs = (
    "results/natural_isolate_escape/natural_vs_escape_all_abs.html"
)
html_arevirumab_comparisons = (
    "results/antibody_escape_profiles/arevirumab_comparisons.html"
)

# # Uncomment for running interactive
# filtered_escape_377H = "../results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
# filtered_escape_89F = "../results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
# filtered_escape_2510C = "../results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
# filtered_escape_121F = "../results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
# filtered_escape_256A = "../results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
# filtered_escape_372D = "../results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"

# contacts_89F = "../data/antibody_contacts/antibody_contacts_89F.csv"
# contacts_377H = "../data/antibody_contacts/antibody_contacts_377H.csv"
# contacts_256A = "../data/antibody_contacts/antibody_contacts_256A.csv"
# contacts_2510C = "../data/antibody_contacts/antibody_contacts_2510C.csv"
# contacts_121F = "../data/antibody_contacts/antibody_contacts_121F.csv"
# contacts_372D = "../data/antibody_contacts/antibody_contacts_372D.csv"

# func_scores = "../results/func_effects/averages/293T_entry_func_effects.csv"
# min_times_seen = 2
# n_selections = 8

# natural_sequence_variation = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
# natural_GPC_sequence_alignment = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"

# fraction_infected_natural_isolates = "../data/validation_frac_infected_natural_isolates.csv"
# out_dir = "../results/validation_plots/"
# neuts_image_path = "../results/validation_plots/validation_neut_curves_natural_isolates.svg"
# neuts_256A_image_path = "../results/validation_plots/validation_neut_256A_curves_natural_isolates.svg"
# corr_image_path = "../results/validation_plots/natural_isolate_validation_correlation.svg"
# corr_256A_image_path = "../results/validation_plots/natural_isolate_validation_256A_correlation.svg"

# out_dir_escape = "../results/antibody_escape_profiles/"
# escape_top10_image_path = "../results/antibody_escape_profiles/natural_isolate_top10_escape_profiles.svg"
# escape_all_image_path = "../results/antibody_escape_profiles/natural_isolate_all_escape_profiles.svg"
# escape_all_256A_image_path = "../results/antibody_escape_profiles/natural_isolate_all_256A_escape_profiles.svg"

# out_dir_natural = "../results/natural_isolate_escape/"
# natural_escape = "../results/natural_isolate_escape/natural_isolate_escape.svg"

# html_nat_mut_freqs_vs_escape = "../results/natural_isolate_escape/nat_mut_freqs_vs_escape.html"
# html_nat_mut_freqs_vs_escape_all_abs = "../results/natural_isolate_escape/nat_mut_freqs_vs_escape_all_abs.html"
# html_natural_vs_escape = "../results/natural_isolate_escape/natural_vs_escape.html"
# html_natural_vs_escape_all_abs = "../results/natural_isolate_escape/natural_vs_escape_all_abs.html"
# html_arevirumab_comparisons = "../results/antibody_escape_profiles/arevirumab_comparisons.html"

def determine_escape(percentile_escape, sequence, escape_file, strain, print_results=False, strong_escape_site=False):
    """
    Function that determines if a sequence contains any 
    escape mutations given a percentile cutoff.
    """

    antibody_name = escape_file.split("/")[-1].split("_")[0]
    
    # Load data as dataframe
    escape_df = pd.read_csv(escape_file)

    # Filter escape df for low functional score mutations
    escape_df = escape_df.query("poor_cell_entry == False")

    # Clip lower scores to 0
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Get muts for top escape
    cutoff = escape_df["escape_median"].quantile(percentile_escape)
    top_escape_muts = (
        tuple(zip(
            escape_df.loc[escape_df["escape_median"] >= cutoff]["mutation"].tolist(),
            escape_df.loc[escape_df["escape_median"] >= cutoff]["escape_median"].tolist(),
        ))
    )

    # Calculate summed escape sites and get sites for top escape
    # either by calculating x-fold median or top percentile
    escape_df = (
        escape_df
        .groupby("site")
        .aggregate({"escape_median" : "sum"})
        .reset_index()
    )
    if strong_escape_site == True:
        cutoff = escape_df["escape_median"].median() * 5
    else:
        cutoff = escape_df["escape_median"].quantile(percentile_escape)
    top_escape_sites = escape_df.loc[escape_df["escape_median"] >= cutoff]["site"].tolist()

    # Filter top escape muts bases on top escape sites
    top_escape_muts = [x for x in top_escape_muts if int(x[0][1:-1]) in top_escape_sites]
    
    # Initialize escape flag
    escape = 0
    
    # Iterate through list of escape mutations
    for escape_mut in top_escape_muts:
        site = int(escape_mut[0][1:-1])
        if sequence[site-1] == escape_mut[0][-1]:
            if print_results:
                print(f"{strain:<75} with \t {escape_mut[0][0]}{site}{escape_mut[0][-1]} \t DMS score: {escape_mut[1]}")
            escape += escape_mut[1]
            
    return escape

# Load alignment and metadata info
natural_seqs_df = pd.DataFrame(columns=["strain", "sequence"])

# Add alignment sequence to dataframe
for curr_fasta in AlignIO.read(natural_GPC_sequence_alignment, "fasta"):
    natural_seqs_df.loc[len(natural_seqs_df.index)] = [
        str(curr_fasta.id),
        str(curr_fasta.seq),
    ] 

# Antibody escape dataframes and percentile cutoffs to use
antibody_files = [
    (filtered_escape_2510C, 0.9),
    (filtered_escape_121F, 0.9), 
    (filtered_escape_377H, 0.9),
    (filtered_escape_256A, 0.9),
    (filtered_escape_372D, 0.9),
    (filtered_escape_89F, 0.9), 
]

for antibody, percentile in antibody_files:
    antibody_name = antibody.split("/")[-1].split("_")[0]
    print(f"{antibody_name} potentially escaped by:")
    natural_seqs_df.apply(lambda x: determine_escape(
        percentile, 
        x["sequence"], 
        antibody, 
        x["strain"],
        print_results=False, # Change to print results
        strong_escape_site=True,
    ), axis=1)
    print()

2510C potentially escaped by:

121F potentially escaped by:

377H potentially escaped by:

256A potentially escaped by:

372D potentially escaped by:

89F potentially escaped by:

# Isolates chosen to validate for escape 
chosen_isolates = [
    "Josiah_NC_004296_reverse_complement_2018-08-13",
    "GA391_OL774861_reverse_complement_1977-XX-XX",
    "LASV_H-sapiens-tc_NGA_2016_IRR_007_MK107922_2016-01-18",
    "Lassa_virus_H-sapiens-wt_NGA_2018_ISTH_1024_MH157037_2018-02-14",
    "LM395-SLE-2009_KM822115_2009-XX-XX",
]

# Create subset of df for chosen isolates
validation_isolates = (
    natural_seqs_df.loc[natural_seqs_df["strain"].isin(chosen_isolates)]
    .reset_index(drop=True)
)

# Get Josiah sequence for comparison
josiah_sequence = validation_isolates.loc[validation_isolates["strain"] == "Josiah_NC_004296_reverse_complement_2018-08-13"].at[0,"sequence"]

# Rename dictionary for virus names
rename_dict = {
    "WT" : "unmutated Josiah strain",
    "OL774861" : "GA391 strain",
    "KM822115" : "LM395 strain",
    "MK107922" : "IRR007 strain",
    "MH157037" : "ISTH1024 strain",
    "Josiah_NC_004296_reverse_complement_2018-08-13" : "unmutated Josiah strain",
    "GA391_OL774861_reverse_complement_1977-XX-XX" : "GA391 strain",
    "LASV_H-sapiens-tc_NGA_2016_IRR_007_MK107922_2016-01-18" : "IRR007 strain",
    "Lassa_virus_H-sapiens-wt_NGA_2018_ISTH_1024_MH157037_2018-02-14" : "ISTH1024 strain",
    "LM395-SLE-2009_KM822115_2009-XX-XX" : "LM395 strain",
    "N89D" : "N89D Josiah strain",
    "K126N" : "K126N Josiah strain",
    "E228D" : "E228D Josiah strain",
    "H398K" : "H398K Josiah strain",
    "D401E" : "D401E Josiah strain",
}

# Rename viruses and column name
validation_isolates["strain"] = (
    validation_isolates["strain"].replace(rename_dict)    
)

# Get counts of variants for each validation site
site_228_count = 0
site_89_count = 0
site_398_count = 0
site_401_count = 0
site_126_count = 0
site_228D_count = 0
site_89D_count = 0
site_398K_count = 0
site_401E_count = 0
site_126N_count = 0
all_position_variants = [0] * 491
all_position_variants_list =  [ [] for _ in range(491) ]
for seq in natural_seqs_df["sequence"].tolist():
    if seq[227] != "E":
        site_228_count += 1
        if seq[227] == "D":
            site_228D_count += 1
    if seq[88] != "N":
        site_89_count += 1
        if seq[88] == "D":
            site_89D_count += 1
    if seq[397] != "H":
        site_398_count += 1
        if seq[397] == "K":
            site_398K_count += 1
    if seq[400] != "D":
        site_401_count += 1
        if seq[400] == "E":
            site_401E_count += 1
    if seq[125] != "K":
        site_126_count += 1
        if seq[125] == "N":
            site_126N_count += 1
    for i in range(491):
        if seq[i] != josiah_sequence[i] and seq[i] != "-":
            all_position_variants[i] += 1
            all_position_variants_list[i].append(josiah_sequence[i] + str(i+1) + seq[i])

# Print counts of each variant for chosen validations
print(f"Total number of sequences analyzed: {len(natural_seqs_df['sequence'].tolist())}")
print(f"Total number of non-Josiah strain variants at position 228: {site_228_count} (E228D variants: {site_228D_count})")
print(f"Total number of non-Josiah strain variants at position 89: {site_89_count} (N89D variants: {site_89D_count})")
print(f"Total number of non-Josiah strain variants at position 398: {site_398_count} (H398K variants: {site_398K_count})")
print(f"Total number of non-Josiah strain variants at position 401: {site_401_count} (D401E variants: {site_401E_count})")
print(f"Total number of non-Josiah strain variants at position 126: {site_126_count} (K126N variants: {site_126N_count})")
print()

Total number of sequences analyzed: 572
Total number of non-Josiah strain variants at position 228: 18 (E228D variants: 16)
Total number of non-Josiah strain variants at position 89: 1 (N89D variants: 1)
Total number of non-Josiah strain variants at position 398: 25 (H398K variants: 1)
Total number of non-Josiah strain variants at position 401: 4 (D401E variants: 4)
Total number of non-Josiah strain variants at position 126: 48 (K126N variants: 1)

def create_line_and_logoplots(
    escape_file,  
    contacts_file,
    func_scores, 
    min_times_seen,  
    n_selections, 
    logo_plot,
    output_file = None,
    sites = None,
    name = None,
    amino_acids_to_color = None,
    only_top_10_sites=False,
):
    """
    Function that filters and writes an antibody escape csv.
    Also creates summed escape profiles and logoplots.
    """

    antibody_name = escape_file.split("/")[-1].split("_")[0]

    # Read data
    escape_df = pd.read_csv(escape_file)
    func_scores = pd.read_csv(func_scores)
    contacts_df = pd.read_csv(contacts_file)
    
    # Create mutation column to match antibody df
    func_scores["site"] = func_scores["site"].astype(str)
    func_scores["mutation"] = func_scores["wildtype"] + func_scores["site"] + func_scores["mutant"]

    # Clip lower scores to 0
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Summed escape to get top escape sites to show
    summed_df = (
        escape_df
        .groupby(["site", "wildtype"])
        .aggregate({
            "escape_median" : "sum",
        })
        .rename(columns={"escape_median" : "site_escape"})
        .reset_index()
    )
    
    # Top escape sites for each antibody combined to show
    if sites == None:
        sites = sorted(summed_df.nlargest(18, "site_escape")["site"].tolist())
    # Filter for top 10 sites if true
    if only_top_10_sites == True:
        summed_score_dict = dict(zip(summed_df["site"].tolist(), summed_df["site_escape"]))
        filtered_sites = (
            [x[0] for x in sorted([(x, summed_score_dict[x]) for x in sites], key = lambda x: x[1], reverse=True)[0:10]]
        )
        escape_df["show_site"] = escape_df.apply(lambda x: True if (x["site"] in sites and x["site"] in filtered_sites) else False, axis=1)
    else:
        escape_df["show_site"] = escape_df.apply(lambda x: True if x["site"] in sites else False, axis=1)
    
    # Shade contact sites in logo plot
    shade_sites = list(contacts_df.loc[contacts_df["distance"] == 4]["position"].unique())
    
    # **
    # # Uncomment to show antibody contacts
    # print(antibody_name)
    # print(f"Contact sites: {shade_sites}")
    # print()
    # **
    
    escape_df["shade_site"] = escape_df.apply(lambda x: "#DDCC77" if x["site"] in shade_sites else None, axis=1)
    escape_df["shade_alpha"] = 0.35
    escape_df = (
        escape_df.merge(
            summed_df,
            how="left",
            on=["site", "wildtype"],
            validate="many_to_one",
        )
    )

    # Drop extra columns
    escape_df = (
        escape_df.drop(
            escape_df.columns.difference([
                "site", 
                "wildtype",
                "mutant", 
                "escape_median", 
                "show_site", 
                "shade_site",
                "shade_alpha",
                "site_escape", 
            ]), axis=1)
    )

    # Fill in missing sites
    seen_list = [False]*491
    for index in range(len(escape_df.index)):
        site = escape_df.at[index, "site"] - 1 
        seen_list[site] = True
    for index, seen in enumerate(seen_list):
        if seen == False:
            site = index + 1
            # Add missing sites
            escape_df.loc[len(escape_df.index)] = [
                site,
                "X",
                "X",
                0,
                False,
                None,
                None,
                0,
            ]

    # Sort by site
    escape_df = (
        escape_df
        .sort_values(by="site")
        .astype({"mutant" : "str"})
        .reset_index(drop=True)
    )

    # Merge functional and escape dfs
    func_scores["site"] = func_scores["site"].astype("int")
    func_scores = func_scores.loc[func_scores["mutant"] != "*"] # remove stop codons
    func_scores["effect"] = func_scores["effect"].clip(upper=0, lower=-2) # clip scores 
    escape_df = (
        escape_df.merge(
            func_scores,
            how="left",
            on=["site", "wildtype", "mutant"],
            validate="one_to_one",
        )
    )
    escape_df["effect"] = escape_df["effect"].fillna(-2) # missing functional values are filled as -4 to make less visible
    
    # Color specified mutants or color by functional effect
    if amino_acids_to_color != None:
        escape_df["color"] = (
            escape_df.apply(
                lambda x: "#EE7733" if (x["site"], x["mutant"]) in amino_acids_to_color else "#000000", axis=1
            )  
        )
    else:
        # Add color column for logo plots
        func_color_map = dmslogo.colorschemes.ValueToColorMap(
            minvalue=func_scores["effect"].min(),
            maxvalue=func_scores["effect"].max(),
            cmap=matplotlib.colors.ListedColormap(color_gradient_hex("white", "#000000", n=20))
        )
        escape_df = (
            escape_df.assign(
                color=lambda x: x["effect"].map(func_color_map.val_to_color)
            )
        )

    # Add wildtype to each site for logo plot
    escape_df["wt_site"] = escape_df["wildtype"] + escape_df["site"].map(str)

    # Set ylim for each antibody
    fixed_ymin = None
    fixed_ymax = None
    if antibody_name == "2510C":
        fixed_ymin = -6.875
        fixed_ymax = 61.875
    elif antibody_name == "121F":
        fixed_ymin = -1.25
        fixed_ymax = 11.25
    elif antibody_name == "377H" and len(sites) == 33:
        fixed_ymin = -1.5
        fixed_ymax = 13.5
    elif antibody_name == "377H" and len(sites) == 31:
        fixed_ymin = -5 
        fixed_ymax = 45
    elif antibody_name == "256A" and len(sites) == 33:
        fixed_ymin = -0.5
        fixed_ymax = 4.5
    elif antibody_name == "372D":
        fixed_ymin = -0.5
        fixed_ymax = 4.5
    elif antibody_name == "89F":
        fixed_ymin = -1.25
        fixed_ymax = 11.25
    else:
        print("Error! No ylims set!")

    _, logoplot = dmslogo.draw_logo(
        escape_df.query("show_site == True"),
        x_col="site",
        letter_col="mutant",
        letter_height_col="escape_median",
        ax=logo_plot,
        xtick_col="wt_site",
        color_col="color",
        shade_color_col="shade_site",
        shade_alpha_col="shade_alpha",
        draw_line_at_zero="never",
        fixed_ymin=fixed_ymin,
        fixed_ymax=fixed_ymax,
    )

    logoplot.set(ylabel=None, xlabel=None)
    x_labels = logoplot.get_xticklabels()
    logoplot.set_xticklabels(labels=x_labels, rotation=90, horizontalalignment="center", fontsize=6)
    # Change all spines
    for axis in ["top", "bottom", "left", "right"]:
        logoplot.spines[axis].set_linewidth(1)
    logoplot.tick_params(axis="both", length=2, width=1, pad=1)


    # Set antibody specific y axis ticks
    if antibody_name == "2510C":
        yticks = [0, 25, 50]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "25", "50"], fontsize=6)
        logoplot.set_title(
            antibody_name[0:2] + "." + antibody_name[2:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences", 
            fontsize=8,
            color="#44AA99",
        )
    if antibody_name == "121F":
        yticks = [0, 5, 10]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "5", "10"], fontsize=6)
        logoplot.set_title(
            antibody_name[0:2] + "." + antibody_name[2:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences",
            fontsize=8,
            color="#999933",
        )
    if antibody_name == "377H" and len(sites) == 33:
        yticks = [0, 5, 10]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "5", "10"], fontsize=6)
        logoplot.set_title(
            antibody_name[0:2] + "." + antibody_name[2:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences", 
            fontsize=8,
            color="#AA4499",
        )
    if antibody_name == "377H" and len(sites) == 31:
        yticks = [0, 20, 40]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "20", "40"], fontsize=6)
        logoplot.set_title(
            antibody_name[0:2] + "." + antibody_name[2:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences",
            fontsize=8,
            color="#AA4499",
        )
    if antibody_name == "256A" and len(sites) == 33:
        yticks = [0, 2, 4]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "2", "4"], fontsize=6)
        logoplot.set_title(
            antibody_name[0:2] + "." + antibody_name[2:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences", 
            fontsize=8,
            color="#AA4499",
        )
    if antibody_name == "372D":
        yticks = [0, 2, 4]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "2", "4"], fontsize=6)
        logoplot.set_title(
            antibody_name[0:2] + "." + antibody_name[2:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences",
            fontsize=8,
            color="#AA4499",
        )
    if antibody_name == "89F":
        yticks = [0, 5, 10]
        logoplot.set_yticks(yticks)
        logoplot.set_yticklabels(labels=["0", "5", "10"], fontsize=6)
        logoplot.set_title(
            antibody_name[0] + "." + antibody_name[1:] + " escape for sites different between Josiah and " + name.split(" ")[0] + " sequences",
            fontsize=8,
            color="#117733",
        )

def get_site_differences(seq1, seq2):
    """
    Function that returns a list of all sites
    that are different between two sequences.
    """
    list_of_sites = []
    list_of_muts = []
    site = 1
    for s1, s2 in zip(seq1, seq2):
        if s1 != s2 and s1 != "-" and s2 != "-":
            list_of_sites.append(site)
            list_of_muts.append(s2)
        site += 1
    return list_of_sites, tuple(zip(list_of_sites, list_of_muts))

# Get list of sequence differences compared to Josiah
list_of_different_sites = []
for index in validation_isolates.index:
    curr_seq_name = validation_isolates.at[index, "strain"]
    curr_seq = validation_isolates.at[index, "sequence"]
    site_differences, mut_differences = get_site_differences(josiah_sequence, curr_seq)
    list_of_different_sites.append((curr_seq_name, site_differences, mut_differences))

# Antibody contact files and escape files
contacts_files = [
    contacts_2510C,
    contacts_121F,
    contacts_377H,
    # contacts_256A,
    contacts_377H, # duplicate 37.7H because validated with two strains
    contacts_372D,
    contacts_89F,
]

filtered_escape_files = [
    filtered_escape_2510C,
    filtered_escape_121F,
    filtered_escape_377H,
    # filtered_escape_256A,
    filtered_escape_377H, # duplicate 37.7H because validated with two strains
    filtered_escape_372D,
    filtered_escape_89F,
]

# Set figure size and subplots
fig, axes = plt.subplots(
    6, 
    1, 
    figsize=(4.5, 6), 
    # sharex="col"
)

# Iterate through list of antibody files
for i in range(len(antibody_files)):
    name = None
    sites = None
    AA_to_color = None
    if i == 0:
        name = list_of_different_sites[1][0]
        sites = list_of_different_sites[1][1]
        AA_to_color = list_of_different_sites[1][2]
    elif i == 1:
        name = list_of_different_sites[4][0]
        sites = list_of_different_sites[4][1]
        AA_to_color = list_of_different_sites[4][2]
    elif i == 2:
        name = list_of_different_sites[1][0]
        sites = list_of_different_sites[1][1]
        AA_to_color = list_of_different_sites[1][2]
    elif i == 3:
        name = list_of_different_sites[2][0]
        sites = list_of_different_sites[2][1]
        AA_to_color = list_of_different_sites[2][2]
    elif i == 4:
        name = list_of_different_sites[1][0]
        sites = list_of_different_sites[1][1]
        AA_to_color = list_of_different_sites[1][2]
    elif i == 5:
        name = list_of_different_sites[3][0]
        sites = list_of_different_sites[3][1]
        AA_to_color = list_of_different_sites[3][2]
    else:
        print("Error! Index out of range!")

    create_line_and_logoplots(
        filtered_escape_files[i], 
        contacts_files[i],
        func_scores, 
        min_times_seen, 
        n_selections, 
        axes[i],
        sites=sorted(sites),
        name=name,
        amino_acids_to_color=AA_to_color,
        only_top_10_sites=True,
    )

fig.subplots_adjust(
    bottom = 0,   # the bottom of the subplots of the figure
    top = 1,      # the top of the subplots of the figure
    wspace = 0,   # the amount of width reserved for blank space between subplots
    hspace = 0.8,   # the amount of height reserved for white space between subplots
)

# Common Y axis labels
fig.text(-0.075, 0.5, "site escape", va="center", rotation="vertical", fontsize=8)
fig.text(0.5, -0.07, "site", ha="center", fontsize=8)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_escape):
    os.mkdir(out_dir_escape)

# Save fig
plt.savefig(escape_top10_image_path)

# Set figure size and subplots
fig, axes = plt.subplots(
    6, 
    1, 
    figsize=(6, 6), 
    # sharex="col"
)

# Iterate through list of antibody files
for i in range(len(antibody_files)):
    name = None
    sites = None
    AA_to_color = None
    if i == 0:
        name = list_of_different_sites[1][0]
        sites = list_of_different_sites[1][1]
        AA_to_color = list_of_different_sites[1][2]
    elif i == 1:
        name = list_of_different_sites[4][0]
        sites = list_of_different_sites[4][1]
        AA_to_color = list_of_different_sites[4][2]
    elif i == 2:
        name = list_of_different_sites[1][0]
        sites = list_of_different_sites[1][1]
        AA_to_color = list_of_different_sites[1][2]
    elif i == 3:
        name = list_of_different_sites[2][0]
        sites = list_of_different_sites[2][1]
        AA_to_color = list_of_different_sites[2][2]
    elif i == 4:
        name = list_of_different_sites[1][0]
        sites = list_of_different_sites[1][1]
        AA_to_color = list_of_different_sites[1][2]
    elif i == 5:
        name = list_of_different_sites[3][0]
        sites = list_of_different_sites[3][1]
        AA_to_color = list_of_different_sites[3][2]
    else:
        print("Error! Index out of range!")

    create_line_and_logoplots(
        filtered_escape_files[i], 
        contacts_files[i],
        func_scores, 
        min_times_seen, 
        n_selections, 
        axes[i],
        sites=sorted(sites),
        name=name,
        amino_acids_to_color=AA_to_color,
        only_top_10_sites=False,
    )

fig.subplots_adjust(
    bottom = 0,   # the bottom of the subplots of the figure
    top = 1,      # the top of the subplots of the figure
    wspace = 0,   # the amount of width reserved for blank space between subplots
    hspace = 0.8,   # the amount of height reserved for white space between subplots
)

# Common Y axis labels
fig.text(-0.075, 0.5, "site escape", va="center", rotation="vertical", fontsize=8)
fig.text(0.5, -0.07, "site", ha="center", fontsize=8)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_escape):
    os.mkdir(out_dir_escape)

# Save fig
plt.savefig(escape_all_image_path)

# Extract natural sequence and site differences
name = list_of_different_sites[1][0]
sites = list_of_different_sites[1][1]
AA_to_color = list_of_different_sites[1][2]

# Set figure size and subplots
fig, axes = plt.subplots( 
    figsize=(6, 1), 
    # sharex="col"
)

create_line_and_logoplots(
    filtered_escape_256A, 
    contacts_256A,
    func_scores, 
    min_times_seen, 
    n_selections, 
    axes,
    sites=sorted(sites),
    name=name,
    amino_acids_to_color=AA_to_color,
    only_top_10_sites=False,
)

# Common Y axis labels
fig.text(0.05, 0.5, "site escape", va="center", rotation="vertical", fontsize=8)
fig.text(0.5, -0.3, "site", ha="center", fontsize=8)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_escape):
    os.mkdir(out_dir_escape)

# Save fig
plt.savefig(escape_all_256A_image_path)

# Rename column name
validation_isolates = (
    validation_isolates.rename(columns={"strain" : "virus"})
)

# Read nuetralization data
frac_infected_natural_isolates = pd.read_csv(fraction_infected_natural_isolates)

# Rename viruses
frac_infected_natural_isolates["virus"] = (
    frac_infected_natural_isolates["virus"].replace(rename_dict)
)

# Fit hill curves using neutcurve
fits = neutcurve.curvefits.CurveFits(
    data=frac_infected_natural_isolates,
    fixbottom=0,
    fixtop=1,
)

# Plot neut data
fig, axes = fits.plotGrid(
    {
        (0, 0) : ("25.10C",
                  [
                      {
                          "serum" : "25.10C", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "25.10C", 
                          "virus" : "GA391 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "GA391"
                      },
                      {
                          "serum" : "25.10C", 
                          "virus" : "E228D Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "E228D\nJosiah"
                      },
                  ]
                 ),
        (1, 0) : ("12.1F",
                  [
                      {
                          "serum" : "12.1F", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "12.1F", 
                          "virus" : "LM395 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "LM395"
                      },
                      {
                          "serum" : "12.1F", 
                          "virus" : "N89D Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "*N89D\nJosiah"
                      },
                  ]
                 ),
        (2, 0) : ("37.7H",
                  [
                      {
                          "serum" : "37.7H", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "37.7H", 
                          "virus" : "GA391 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "GA391"
                      },
                      {
                          "serum" : "37.7H", 
                          "virus" : "H398K Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "H398K\nJosiah"
                      },
                  ]
                 ),
        (3, 0) : ("37.7H",
                  [
                      {
                          "serum" : "37.7H", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "37.7H", 
                          "virus" : "IRR007 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "IRR007"
                      },
                      {
                          "serum" : "37.7H", 
                          "virus" : "D401E Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "D401E\nJosiah"
                      },
                  ]
                 ),
        (4, 0) : ("37.2D",
                  [
                      {
                          "serum" : "37.2D", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "37.2D", 
                          "virus" : "GA391 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "GA391"
                      },
                      {
                          "serum" : "37.2D", 
                          "virus" : "H398K Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "H398K\nJosiah"
                      },
                  ]
                 ),
        (5, 0) : ("8.9F",
                  [
                      {
                          "serum" : "8.9F", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "8.9F", 
                          "virus" : "ISTH1024 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "ISTH1024"
                      },
                      {
                          "serum" : "8.9F", 
                          "virus" : "K126N Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "K126N\nJosiah"
                      },
                  ]
                 ),
    },
    sharex=True,
    sharey=False,
    xlabel="",
    ylabel="",
    attempt_shared_legend=False,
    despine=True,
)


antibody_names = [
    "25.10C",
    "12.1F",
    "37.7H",
    "37.7H",
    "37.2D",
    "8.9F",
]

for index in range(6):
    if antibody_names[index] == "25.10C":
        axes[index,0].set_title(
            "antibody " + antibody_names[index], 
            # weight="bold",
            fontsize=8,
            color="#44AA99",
        )
    elif antibody_names[index] == "12.1F":
        axes[index,0].set_title(
            "antibody " + antibody_names[index], 
            # weight="bold",
            fontsize=8,
            color="#999933",
        )
    elif antibody_names[index] == "37.7H" or antibody_names[index] == "37.2D" :
        axes[index,0].set_title(
            "antibody " + antibody_names[index], 
            # weight="bold",
            fontsize=8,
            color="#AA4499",
        )
    elif antibody_names[index] == "8.9F":
        axes[index,0].set_title(
            "antibody " + antibody_names[index], 
            # weight="bold",
            fontsize=8,
            color="#117733",
        )
    axes[index,0].set_ylim(-0.1, 1.3)
    axes[index,0].set_yticks([0, 0.5, 1.0])
    axes[index,0].set_yticklabels(labels=[0, 0.5, 1.0], fontsize=6)
    axes[index,0].set_xlim(0.0005, 12.5)
    axes[index,0].set_xticks([0.001, 0.01, 0.1, 1, 10,])
    axes[index,0].set_xticklabels(labels=["$10^{-3}$", "$10^{-2}$", "$10^{-1}$", "$10^0$", "$10^1$",], fontsize=6)
    plt.setp(axes[index,0].collections, alpha=0.8, linewidths=0.5, colors="black") # for vertical error bar segment
    plt.setp(axes[index,0].lines, alpha=0.8, markeredgewidth=0.5, markeredgecolor="black", linewidth=1, markersize=4) # for the lines and markers
    sns.move_legend(
        axes[index,0], 
        loc="upper left",
        borderaxespad=0,
        frameon=False,
        bbox_to_anchor=(1, 1.3),
        fontsize=6,
        markerscale=1,
        handletextpad=0.1,
        title="Lassa GPC",
        title_fontproperties={"weight" : "bold", "size" : 6},
        alignment="center",
    )

    # Add edges to legend markers to match scatter plot
    for ha in axes[index,0].legend_.legendHandles:
        ha.set_markeredgecolor("black")
        ha.set_markeredgewidth(0.5)
        ha.set_linewidth(0)
        ha.set_markersize(5)
        
    # Change all spines
    for axis in ["top", "bottom", "left", "right"]:
        axes[index,0].spines[axis].set_linewidth(1)
    axes[index,0].tick_params(axis="both", length=2, width=1)

# Adjust spacing of subplots
fig.subplots_adjust(
    left  = 0,  # the left side of the subplots of the figure
    right = 1,    # the right side of the subplots of the figure
    bottom = 0,   # the bottom of the subplots of the figure
    top = 1,      # the top of the subplots of the figure
    wspace = 0,   # the amount of width reserved for blank space between subplots
    hspace = 0.8,   # the amount of height reserved for white space between subplots
)

# Common X and Y axis labels
fig.text(-0.4, 0.5, "fraction infectivity", va="center", rotation="vertical", fontsize=8)
fig.text(0.5, -0.05, "concentration (\u03BCg/mL)", ha="center", fontsize=8)

width = 0.9
height = 6
fig.set_size_inches(width, height)

# Make output dir if doesn't exist
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Save fig
plt.savefig(neuts_image_path)

# Plot neut data
fig, axes = fits.plotGrid(
    {
        (0, 0) : ("25.6A",
                  [
                      {
                          "serum" : "25.6A", 
                          "virus" : "unmutated Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[0],
                          "marker" : "o",
                          "label" : "unmutated\nJosiah"
                      },
                      {
                          "serum" : "25.6A", 
                          "virus" : "GA391 strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[12],
                          "marker" : "o",
                          "label" : "GA391"
                      },
                      {
                          "serum" : "25.6A", 
                          "virus" : "H398K Josiah strain",
                          "replicate" : "average",
                          "color" : tol_muted_adjusted[10],
                          "marker" : "o",
                          "label" : "H398K\nJosiah"
                      },
                  ]
                 ),
    },
    sharex=True,
    sharey=False,
    xlabel="",
    ylabel="",
    attempt_shared_legend=False,
    despine=True,
)

axes[0,0].set_title(
    "antibody " + "25.6A", 
    # weight="bold",
    fontsize=8,
    color="#AA4499",
)
axes[0,0].set_ylim(-0.1, 1.3)
axes[0,0].set_yticks([0, 0.5, 1.0])
axes[0,0].set_yticklabels(labels=[0, 0.5, 1.0], fontsize=6)
axes[0,0].set_xlim(0.0005, 12.5)
axes[0,0].set_xticks([0.001, 0.01, 0.1, 1, 10,])
axes[0,0].set_xticklabels(labels=["$10^{-3}$", "$10^{-2}$", "$10^{-1}$", "$10^0$", "$10^1$",], fontsize=6)
plt.setp(axes[0,0].collections, alpha=0.8, linewidths=0.5, colors="black") # for vertical error bar segment
plt.setp(axes[0,0].lines, alpha=0.8, markeredgewidth=0.5, markeredgecolor="black", linewidth=1, markersize=4) # for the lines and markers
sns.move_legend(
    axes[0,0], 
    loc="upper left",
    borderaxespad=0,
    frameon=False,
    bbox_to_anchor=(1, 1.3),
    fontsize=6,
    markerscale=1,
    handletextpad=0.1,
    title="Lassa GPC",
    title_fontproperties={"weight" : "bold", "size" : 6},
    alignment="center",
)

# Add edges to legend markers to match scatter plot
for ha in axes[0,0].legend_.legendHandles:
    ha.set_markeredgecolor("black")
    ha.set_markeredgewidth(0.5)
    ha.set_linewidth(0)
    ha.set_markersize(5)
    
# Change all spines
for axis in ["top", "bottom", "left", "right"]:
    axes[0,0].spines[axis].set_linewidth(1)
axes[0,0].tick_params(axis="both", length=2, width=1)

# Adjust spacing of subplots
fig.subplots_adjust(
    left  = 0,  # the left side of the subplots of the figure
    right = 1,    # the right side of the subplots of the figure
    bottom = 0,   # the bottom of the subplots of the figure
    top = 1,      # the top of the subplots of the figure
    wspace = 0,   # the amount of width reserved for blank space between subplots
    hspace = 0.8,   # the amount of height reserved for white space between subplots
)

# Common X and Y axis labels
fig.text(-0.4, 0.5, "fraction infectivity", va="center", rotation="vertical", fontsize=8)
fig.text(0.5, -0.4, "concentration (\u03BCg/mL)", ha="center", fontsize=8)

width = 0.9
height = 0.7
fig.set_size_inches(width, height)

# Make output dir if doesn't exist
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Save fig
plt.savefig(neuts_256A_image_path)

# IC values to calculate
fitParams = fits.fitParams(ics=[50, 80, 99])[["serum", "virus", "ic50", "ic50_bound"]]

# Merge isolate sequence and neut dfs
validation_df = (
    fitParams.merge(
        validation_isolates,
        how="left",
        on="virus",
        validate="many_to_one"
    )
)

# Add sinlge mutant sequences
validation_df.at[2, "sequence"] = (
    josiah_sequence[:int(validation_df.at[2,"virus"].split()[0][1:-1])-1] + validation_df.at[2,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[2,"virus"].split()[0][1:-1]):]
)

validation_df.at[5, "sequence"] = (
    josiah_sequence[:int(validation_df.at[5,"virus"].split()[0][1:-1])-1] + validation_df.at[5,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[5,"virus"].split()[0][1:-1]):]
)

validation_df.at[8, "sequence"] = (
    josiah_sequence[:int(validation_df.at[8,"virus"].split()[0][1:-1])-1] + validation_df.at[8,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[8,"virus"].split()[0][1:-1]):]
)

validation_df.at[10, "sequence"] = (
    josiah_sequence[:int(validation_df.at[10,"virus"].split()[0][1:-1])-1] + validation_df.at[10,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[10,"virus"].split()[0][1:-1]):]
)

validation_df.at[13, "sequence"] = (
    josiah_sequence[:int(validation_df.at[13,"virus"].split()[0][1:-1])-1] + validation_df.at[13,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[13,"virus"].split()[0][1:-1]):]
)

validation_df.at[16, "sequence"] = (
    josiah_sequence[:int(validation_df.at[16,"virus"].split()[0][1:-1])-1] + validation_df.at[16,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[16,"virus"].split()[0][1:-1]):]
)

validation_df.at[19, "sequence"] = (
    josiah_sequence[:int(validation_df.at[19,"virus"].split()[0][1:-1])-1] + validation_df.at[19,"virus"].split()[0][-1] + josiah_sequence[int(validation_df.at[19,"virus"].split()[0][1:-1]):]
)

# Antibody conversion dict
antibody_file_dict = {
    "25.10C" : filtered_escape_2510C,
    "12.1F" : filtered_escape_121F,
    "37.7H" : filtered_escape_377H,
    "25.6A" : filtered_escape_256A,
    "37.2D" : filtered_escape_372D,
    "8.9F" : filtered_escape_89F,
}

# Calculate total escape for each sequence using an additive model
validation_df["total_escape"] = (
    validation_df.apply(lambda x: determine_escape(
        0, 
        x["sequence"], 
        antibody_file_dict[x["serum"]], 
        x["virus"],
    ), axis=1)
)

# Rename columns and reformat columns
validation_df = (
    validation_df.rename(columns={
        "ic50_bound" : "lower bound",
        "serum" : "antibody",
    })
)
validation_df["lower bound"] = (
    validation_df["lower bound"].replace({"interpolated" : False, "lower" : True})
)

# Mark strains as natural isolates or single mutants
validation_df["Lassa GPC"] = (
    validation_df.apply(lambda x: "unmutated\nJosiah" if x["virus"] == "unmutated Josiah strain" else 
        ("Josiah\nvariant" if x["virus"] in [
            "N89D Josiah strain",
            "K126N Josiah strain",
            "E228D Josiah strain",
            "H398K Josiah strain",
            "D401E Josiah strain",
        ] else "natural\nsequence"), axis=1)
)

# Set figure size and subplots
fig, axes = plt.subplots(
    1, 
    5, 
    figsize=(6.25, 1.25), 
    sharey=True,
    sharex=True,
)
color_dict = {
    "25.10C" : "#44AA99", 
    "12.1F" : "#999933", 
    "37.7H" : "#AA4499", 
    "37.2D" : "#AA4499", 
    "8.9F" : "#117733",
}

for index,antibody in enumerate(["25.10C", "12.1F", "37.7H", "37.2D", "8.9F"]):

    curr_antibody = (
        validation_df.loc[validation_df["antibody"] == antibody]
        .reset_index(drop=True)
    )
        

    # Calculate correlation between predicted and measured
    r, p = sp.stats.pearsonr(
        x=np.log10(curr_antibody["ic50"]), # log10 because plotting on log scale
        y=curr_antibody["total_escape"]
    )
    print(f"R={r}")
    print(f"R^2={r**2}")
    
    corr_chart = sns.scatterplot(
        data=curr_antibody,
        x="ic50", 
        y="total_escape",
        hue="Lassa GPC",
        palette={
            "unmutated\nJosiah" : tol_muted_adjusted[0],
            "natural\nsequence" : tol_muted_adjusted[12],
            "Josiah\nvariant" : tol_muted_adjusted[10],
        },
        style="lower bound",
        markers=["o", "s"],
        alpha=0.8,
        edgecolor="black",
        ax=axes[index],
        s=25,
    )
    # Change all spines
    for axis in ["top", "bottom", "left", "right"]:
        corr_chart.spines[axis].set_linewidth(1)
    sns.despine()
    corr_chart.tick_params(axis="both", length=3, width=1)
    corr_chart.set(xlabel=None)
    corr_chart.set_title(
        antibody,
        # weight="bold",
        fontsize=8,
        color=color_dict[antibody]
    )
    corr_chart.set_ylabel(
        "escape score measured\nby DMS (arbitrary units)", 
        # weight="bold",
        fontsize=8,
    )
    corr_chart.set_xscale("log")
    corr_chart.xaxis.set_minor_locator(mticker.NullLocator()) # no minor ticks
    corr_chart.set_xlim(0.005, 500)
    corr_chart.set_ylim(-0.5, 8.5)
    corr_chart.set_xticks([0.01, 1, 100])
    corr_chart.set_xticklabels(["$10^{-2}$", "$10^{0}$", "$10^{2}$"], size=6)
    corr_chart.set_yticks([0, 2, 4, 6, 8])
    corr_chart.set_yticklabels(corr_chart.get_yticks(), size=6)
    if index == 2:
        sns.move_legend(
            corr_chart, 
            "upper left", 
            bbox_to_anchor=(3.5,1),
            fontsize=6,
            markerscale=1,
            handletextpad=0.1,
            frameon=False,
            borderaxespad=0.75,
        )
        # Add edges to legend markers to match scatter plot
        for ha in corr_chart.legend_.legendHandles:
            ha.set_edgecolor("black")
            ha.set_linewidths(0.5)
        corr_chart.get_legend().get_texts()[0].set_weight("bold")
        corr_chart.get_legend().get_texts()[4].set_weight("bold")
    else:
        corr_chart.get_legend().remove()

    # Add correlation to chart
    corr_chart.text(
        0.01, 
        7.5,
        f"r={r:.2f}", 
        horizontalalignment="left",  
        weight="bold",
        fontsize=6,
    )
    
    # Label points
    for i in range(0, curr_antibody.shape[0]):
        x_pos = curr_antibody.at[i, "ic50"]
        y_pos = curr_antibody.at[i, "total_escape"]
        name = curr_antibody.at[i, "virus"].split(" ")[0]
        if antibody == "12.1F" and name == "N89D":
            corr_chart.text(
                x_pos+15, 
                y_pos, 
                f"*{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
                style="italic",
            )
        elif antibody == "12.1F" and name == "LM395":
            corr_chart.text(
                x_pos+15, 
                y_pos, 
                f"{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
            )
        elif antibody == "37.7H" and name == "D401E":
            corr_chart.text(
                x_pos+0.1, 
                y_pos, 
                f"{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
            )
        elif antibody == "37.7H" and name == "IRR007":
            corr_chart.text(
                x_pos+15, 
                y_pos+0.25, 
                f"{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
            )
        elif antibody == "37.7H" and name == "GA391":
            corr_chart.text(
                x_pos+15, 
                y_pos-0.75, 
                f"{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
            )
        elif antibody == "37.2D" and name == "GA391":
            corr_chart.text(
                x_pos+0.5, 
                y_pos, 
                f"{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
            )
        elif antibody == "37.2D" and name == "H398K":
            corr_chart.text(
                x_pos+0.75, 
                y_pos-0.25, 
                f"{name}", 
                horizontalalignment="left",  
                # weight="bold",
                fontsize=6,
            )
        else:
            if name != "unmutated":
                corr_chart.text(
                    x_pos, 
                    y_pos+0.5, 
                    f"{name}", 
                    horizontalalignment="left",  
                    # weight="bold",
                    fontsize=6,
                )
            
    # Set square ratio
    corr_chart.set_box_aspect(1)

# Common X axis labels
fig.text(0.5, -0.1, "IC50 (\u03BCg/mL) measured by pseuodvirus neutralization", ha="center", rotation="horizontal", fontsize=8)

# Make output dir if doesn't exist
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Save fig
fig.savefig(corr_image_path)

R=0.9314336729293028
R^2=0.8675686870665714
R=0.9841464060441035
R^2=0.9685441485295255
R=0.9856108104077421
R^2=0.9714286695926062
R=0.8410533082217173
R^2=0.707370667270695
R=0.7334543504174906
R^2=0.5379552841463432

# Set figure size and subplots
fig, axes = plt.subplots(
    figsize=(1.25, 1.25), 
)
color_dict = {
    "25.10C" : "#44AA99", 
    "12.1F" : "#999933", 
    "37.7H" : "#AA4499", 
    "25.6A" : "#AA4499",
    "37.2D" : "#AA4499", 
    "8.9F" : "#117733",
}

for index,antibody in enumerate(["25.6A"]):

    curr_antibody = (
        validation_df.loc[validation_df["antibody"] == antibody]
        .reset_index(drop=True)
    )
        

    # Calculate correlation between predicted and measured
    r, p = sp.stats.pearsonr(
        x=np.log10(curr_antibody["ic50"]), # log10 because plotting on log scale
        y=curr_antibody["total_escape"]
    )
    print(f"R={r}")
    print(f"R^2={r**2}")
    
    corr_chart = sns.scatterplot(
        data=curr_antibody,
        x="ic50", 
        y="total_escape",
        hue="Lassa GPC",
        palette={
            "unmutated\nJosiah" : tol_muted_adjusted[0],
            "natural\nsequence" : tol_muted_adjusted[12],
            "Josiah\nvariant" : tol_muted_adjusted[10],
        },
        style="lower bound",
        markers=["o", "s"],
        alpha=0.8,
        edgecolor="black",
        s=25,
    )
    # Change all spines
    for axis in ["top", "bottom", "left", "right"]:
        corr_chart.spines[axis].set_linewidth(1)
    sns.despine()
    corr_chart.tick_params(axis="both", length=3, width=1)
    corr_chart.set_xlabel(
        "IC50 (\u03BCg/mL) measured\nby pseuodvirus neutralization",
        fontsize=8,
    )
    corr_chart.set_title(
        antibody,
        # weight="bold",
        fontsize=8,
        color=color_dict[antibody]
    )
    corr_chart.set_ylabel(
        "escape score measured\nby DMS (arbitrary units)", 
        # weight="bold",
        fontsize=8,
    )
    corr_chart.set_xscale("log")
    corr_chart.xaxis.set_minor_locator(mticker.NullLocator()) # no minor ticks
    corr_chart.set_xlim(0.005, 500)
    corr_chart.set_ylim(-0.5, 8.5)
    corr_chart.set_xticks([0.01, 1, 100])
    corr_chart.set_xticklabels(["$10^{-2}$", "$10^{0}$", "$10^{2}$"], size=6)
    corr_chart.set_yticks([0, 2, 4, 6, 8])
    corr_chart.set_yticklabels(corr_chart.get_yticks(), size=6)
    sns.move_legend(
        corr_chart, 
        "upper left", 
        bbox_to_anchor=(1.1,1.2),
        fontsize=6,
        markerscale=1,
        handletextpad=0.1,
        frameon=False,
        borderaxespad=0.75,
    )
    # Add edges to legend markers to match scatter plot
    for ha in corr_chart.legend_.legendHandles:
        ha.set_edgecolor("black")
        ha.set_linewidths(0.5)
    corr_chart.get_legend().get_texts()[0].set_weight("bold")
    corr_chart.get_legend().get_texts()[4].set_weight("bold")

    # Add correlation to chart
    corr_chart.text(
        0.01, 
        7.5,
        f"r={r:.2f}", 
        horizontalalignment="left",  
        weight="bold",
        fontsize=6,
    )
    
    # Label points
    for i in range(0, curr_antibody.shape[0]):
        x_pos = curr_antibody.at[i, "ic50"]
        y_pos = curr_antibody.at[i, "total_escape"]
        name = curr_antibody.at[i, "virus"].split(" ")[0]
        if antibody == "25.6A" and name == "GA391":
            corr_chart.text(
                x_pos, 
                y_pos+0.5, 
                f"{name}", 
                horizontalalignment="left",  
                fontsize=6,
            )
        else:
            if name != "unmutated":
                corr_chart.text(
                    x_pos, 
                    y_pos+0.4, 
                    f"{name}", 
                    horizontalalignment="left",  
                    # weight="bold",
                    fontsize=6,
                )
            
    # Set square ratio
    corr_chart.set_box_aspect(1)

# Make output dir if doesn't exist
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Save fig
fig.savefig(corr_256A_image_path)

R=0.9806467345321717
R^2=0.9616680179486116

def get_natural_sequence_counts(site, amino_acid, natural_seqs_df):
    """
    Function that counts occurences of an amino acid at a site
    across a dataframe of sequences.
    """
    count = 0
    for seq in natural_seqs_df["sequence"].tolist():
        if seq[site-1] == amino_acid:
            count += 1
    return count
            

# Load data as dataframe
functional_scores = pd.read_csv(func_scores)

# Filter for minimum selections, times seen and no stop codons
functional_scores = (
    functional_scores.query(
        "n_selections >= @n_selections and times_seen >= @min_times_seen and mutant != '*'"
    )
    .drop(columns=["times_seen", "effect_std", "n_selections"])
    .reset_index(drop=True)
)

# Create a wildtype df to fill in missing Josiah/wildtype measurements as 0
josiah_df = pd.DataFrame(
    zip(list(range(1,492)), josiah_sequence, josiah_sequence, [0]*491, [True]*491),
    columns=["site", "wildtype", "mutant", "effect", "Josiah reference"],
)
AA_level_df = (
    pd.concat([functional_scores, josiah_df], ignore_index=True)
    .sort_values(by="site")
    .reset_index(drop=True)
    .fillna(False)
)

# Get natural sequence counts of each mutant and calculate mutation frequencies
# compared to the Josiah reference
AA_level_df["natural_counts"] = (
    AA_level_df.apply(lambda x: get_natural_sequence_counts(x["site"], x["mutant"], natural_seqs_df), axis=1)
)
number_sequences = AA_level_df["natural_counts"].max() # 572 sequences
AA_level_df["mutation_frequency"] = (
    AA_level_df.apply(lambda x: x["natural_counts"]/number_sequences if x["Josiah reference"] == False else None, axis=1)
)


# Add escape to dataframe for each antibody
for antibody_file,_ in antibody_files:

    antibody_name = antibody_file.split("/")[-1].split("_")[0]

    # Load data as dataframe
    escape_df = pd.read_csv(antibody_file)

    # Clip lower scores to 0
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Rename escape column to include antibody name
    escape_df = escape_df.rename(columns={"escape_median" : "escape_" + antibody_name})

    # Merge dataframes
    AA_level_df = (
        AA_level_df.merge(
            escape_df[["site", "wildtype", "mutant", "escape_" + antibody_name]],
            how="left",
            on=["site", "wildtype", "mutant"],
            validate="one_to_one",
        )
    )

# Fill in missing wildtype escape values
AA_level_df["escape_2510C"] = AA_level_df.apply(lambda x: 0 if x["wildtype"] == x["mutant"] else x["escape_2510C"], axis=1)
AA_level_df["escape_121F"] = AA_level_df.apply(lambda x: 0 if x["wildtype"] == x["mutant"] else x["escape_121F"], axis=1)
AA_level_df["escape_377H"] = AA_level_df.apply(lambda x: 0 if x["wildtype"] == x["mutant"] else x["escape_377H"], axis=1)
AA_level_df["escape_256A"] = AA_level_df.apply(lambda x: 0 if x["wildtype"] == x["mutant"] else x["escape_256A"], axis=1)
AA_level_df["escape_372D"] = AA_level_df.apply(lambda x: 0 if x["wildtype"] == x["mutant"] else x["escape_372D"], axis=1)
AA_level_df["escape_89F"] = AA_level_df.apply(lambda x: 0 if x["wildtype"] == x["mutant"] else x["escape_89F"], axis=1)

# Calculate total escape
AA_level_df["cocktail_total_escape"] = (
    AA_level_df[[
        "escape_121F",
        "escape_372D",
        "escape_89F"
    ]].sum(axis=1, skipna=False)
)

# Create a site level dataframe
site_level_df = (
    AA_level_df.groupby(["site", "wildtype"])
    .aggregate({
        "effect" : "mean",
        "escape_2510C" : "sum",
        "escape_121F" : "sum",
        "escape_377H" : "sum",
        "escape_256A" : "sum",
        "escape_372D" : "sum",
        "escape_89F" : "sum",
        "cocktail_total_escape" : "sum",
        "natural_counts" : "sum",
        "mutation_frequency" : "sum",
    })
    .reset_index()
)

# Calculate total escape averages per site
# for Arevirumab-3
site_level_df["cocktail_avg_site_escape"] = (
    site_level_df[[
        "escape_121F",
        "escape_372D",
        "escape_89F"
    ]].mean(axis=1, skipna=False)
)

# Add site level escape to AA level df
AA_level_df = (
    AA_level_df.merge(
        site_level_df[[
            "site", 
            "wildtype", 
            "escape_2510C",
            "escape_121F",
            "escape_377H",
            "escape_256A",
            "escape_372D",
            "escape_89F",
            "cocktail_total_escape",
            # "avg_site_escape",
            "cocktail_avg_site_escape",
        ]].rename(columns={
            "escape_2510C" : "site_escape_2510C",
            "escape_121F" : "site_escape_121F",
            "escape_377H" : "site_escape_377H",
            "escape_256A" : "site_escape_256A",
            "escape_372D" : "site_escape_372D",
            "escape_89F" : "site_escape_89F",
            "cocktail_total_escape" : "cocktail_total_site_escape",
        }),
        how="left",
        on=["site", "wildtype"],
        validate="many_to_one",
    )
)

# Load data as dataframe
natural_variation = pd.read_csv(natural_sequence_variation)

# Drop individual amino acid counts
natural_variation = natural_variation[["site", "entropy", "n_effective"]]

# Merge functional and natural dataframes
site_level_df = (
    site_level_df.merge(
        natural_variation,
        how="left",
        on=["site"],
        validate="one_to_one",
    )
)

# Mark mutations if in top percentile of summed escape sites
for antibody in ["2510C", "121F", "377H", "256A", "372D", "89F"]:

    # Calculate cutoff 
    cutoff = AA_level_df["site_escape_"+antibody].median() * 10

    # Mark sites that are greater than or equal to the top 5% summed escape site
    AA_level_df["top_site_escape_for_" + antibody] = AA_level_df["site_escape_"+antibody].apply(lambda x: True if x >= cutoff else False)

# Mark validation mutations
for antibody in ["2510C", "121F", "377H", "256A", "372D", "89F"]:
    validation_list = []
    if antibody == "2510C":
        validation_list = ["E228D"]
    if antibody == "121F":
        validation_list = ["N89D"]
    if antibody == "377H":
        validation_list = ["H398K", "D401E"]
    if antibody == "256A":
        validation_list = []
    if antibody == "372D":
        validation_list = ["H398K"]
    if antibody == "89F":
        validation_list = ["K126N"]
    AA_level_df["validation_for_" + antibody] = AA_level_df.apply(lambda x: True if x["wildtype"]+str(x["site"])+x["mutant"] in validation_list else False, axis=1)

# Making two lists for values and colors 
dom = [True, False] 
rng = ["#EE7733FF", "#00000026"] 

subplots = []
for index,antibody in enumerate(antibody_files):
    antibody_name = antibody[0].split("/")[-1].split("_")[0]

    curr_subplot = alt.Chart(AA_level_df, title=antibody_name).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.X(
            "mutation_frequency",
            axis=alt.Axis(
                title="mutation frequency", 
                values=[0, 0.001, 0.01, 0.1, 0.5, 1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
                format="0.3",
            ),
            scale=alt.Scale(type="symlog", constant=0.0005, domain=[-0.0001,1.1])
        ),
        alt.Y(
            "escape_" + antibody_name,
            axis=alt.Axis(
                title="escape", 
                values=[0,1,2,3,4,5,6],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[0,6.1])
        ),
        tooltip=[
            "site",
            "wildtype",
            "mutant",
            "mutation_frequency",
            "effect",
            "escape_" + antibody_name,
        ], 
    ).properties(
        width=250,
        height=150,
    )
    
    subplots.append(curr_subplot)

natural_vs_antibody = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    subplots[3],
    subplots[4],
    subplots[5],
    spacing=5,
    title=["Mutational frequencies vs antibody escape", "for individual amino-acid mutations"],
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_natural):
    os.mkdir(out_dir_natural)

natural_vs_antibody.save(html_nat_mut_freqs_vs_escape)

natural_vs_antibody

r, p = sp.stats.pearsonr(
    AA_level_df[["cocktail_total_escape","mutation_frequency"]].replace([np.inf, -np.inf], np.nan).dropna()["cocktail_total_escape"],
    AA_level_df[["cocktail_total_escape","mutation_frequency"]].replace([np.inf, -np.inf], np.nan).dropna()["mutation_frequency"],
)
print(f"r correlation of cocktail total escape and natural sequence diversity: {r:.2f}")

natural_vs_antibody = alt.Chart(
    AA_level_df,
    title=["Mutational frequencies vs antibody escape across all Arevirumab-3 antibodies", "for individual amino-acid mutations"]
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.15,
).encode(
    alt.Y(
        "cocktail_total_escape",
        axis=alt.Axis(
            title=["escape summed", "across all Arevirumab-3 antibodies"], 
            # values=[0,1,2,3],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        # scale=alt.Scale(domain=[-0.1, 3.1])
    ),
    alt.X(
        "mutation_frequency",
        axis=alt.Axis(
            title="mutation frequency", 
            values=[0, 0.001, 0.01, 0.1, 0.5, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            format="0.3",
        ),
        scale=alt.Scale(type="symlog", constant=0.0005, domain=[-0.0001,1.1])
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutant",
        "mutation_frequency",
        "escape_121F",
        "escape_89F",
        "escape_372D",
        "cocktail_total_escape",
    ], 
).properties(
    width=300,
    height=300,
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_natural):
    os.mkdir(out_dir_natural)

natural_vs_antibody.save(html_nat_mut_freqs_vs_escape_all_abs)

natural_vs_antibody

r correlation of cocktail total escape and natural sequence diversity: -0.03

# Making two lists for values and colors 
dom = [True, False] 
rng = ["#EE7733FF", "#00000026"] 

subplots = []
for index,antibody in enumerate(antibody_files):
    antibody_name = antibody[0].split("/")[-1].split("_")[0]

    curr_subplot = alt.Chart(site_level_df, title=antibody_name).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            "escape_" + antibody_name,
            axis=alt.Axis(
                title="site escape", 
                # values=[0,1,2,3,4,5,6],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            # scale=alt.Scale(domain=[-0.1,6.5])
        ),
        alt.X(
            "mutation_frequency",
            axis=alt.Axis(
                title=["mutation frequencies", "in natural sequences"], 
                values=[0, 0.001, 0.01, 0.1, 0.5, 1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
                format="0.3",
            ),
            scale=alt.Scale(type="symlog", constant=0.0005, domain=[-0.0001,1.1])
        ),
        tooltip=[
            "site",
            "wildtype",
            "mutation_frequency",
            "escape_" + antibody_name,
            "effect",
        ],
    ).properties(
        width=250,
        height=150,
    )
    
    subplots.append(curr_subplot)

natural_vs_antibody_mut_freq = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    subplots[3],
    subplots[4],
    subplots[5],
    spacing=5,
    title=["Mutational frequencies vs antibody escape", "for summed site escape"],
)

subplots = []
for index,antibody in enumerate(antibody_files):
    antibody_name = antibody[0].split("/")[-1].split("_")[0]

    curr_subplot = alt.Chart(site_level_df, title=antibody_name).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            "escape_" + antibody_name,
            axis=alt.Axis(
                title="site escape", 
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            # scale=alt.Scale(domainMin=-1)
        ),
        alt.X(
            "n_effective",
            axis=alt.Axis(
                title=["effective amino acids", "in natural sequences"], 
                values=[1,2,3,4],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[0.9,4])
        ),
        tooltip=[
            "site",
            "wildtype",
            "n_effective",
            "escape_" + antibody_name,
            "effect",
        ],
    ).properties(
        width=250,
        height=150,
    )
    
    subplots.append(curr_subplot)

natural_vs_antibody_neff = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    subplots[3],
    subplots[4],
    subplots[5],
    spacing=5,
    title=["Effective amino acids vs antibody escape", "for summed site escape"]
)

combined_plot = alt.vconcat(
    natural_vs_antibody_mut_freq,
    natural_vs_antibody_neff,
    spacing=5,
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_natural):
    os.mkdir(out_dir_natural)

combined_plot.save(html_natural_vs_escape)

combined_plot

# Making two lists for values and colors 
dom = [True, False] 
rng = ["#EE7733FF", "#00000026"] 

# Calculate statistics
r, p = sp.stats.pearsonr(
    site_level_df[["cocktail_total_escape","mutation_frequency"]].dropna()["cocktail_total_escape"],
    site_level_df[["cocktail_total_escape","mutation_frequency"]].dropna()["mutation_frequency"],
)
print(f"r correlation of site escape summed across all Arevirumab-3 antibodies and site mutational frequency: {r:.2f}")

natural_vs_antibody_mut_freq = alt.Chart(
    site_level_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.15,
).encode(
    alt.Y(
        "cocktail_total_escape",
        axis=alt.Axis(
            title=["site escape", "summed across all Arevirumab-3 antibodies"],  
            # values=[0,1,2,3],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        # scale=alt.Scale(domain=[-0.1, 3.1])
    ),
    alt.X(
        "mutation_frequency",
        axis=alt.Axis(
            title=["mutation frequencies", "in natural sequences"],
            values=[0, 0.001, 0.01, 0.1, 0.5, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            format="0.3",
        ),
        scale=alt.Scale(type="symlog", constant=0.0005, domain=[-0.0001,1.1])
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutation_frequency",
        "escape_121F",
        "escape_89F",
        "escape_372D",
        "cocktail_total_escape",
    ],
).properties(
    width=300,
    height=300,
)

# Calculate statistics
r, p = sp.stats.pearsonr(
    site_level_df[["cocktail_avg_site_escape","mutation_frequency"]].dropna()["cocktail_avg_site_escape"],
    site_level_df[["cocktail_avg_site_escape","mutation_frequency"]].dropna()["mutation_frequency"],
)
print(f"r correlation of site escape averaged across all Arevirumab-3 antibodies and site mutational frequency: {r:.2f}")

natural_vs_antibody_avg_mut_freq = alt.Chart(
    site_level_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.15,
).encode(
    alt.Y(
        "cocktail_avg_site_escape",
        axis=alt.Axis(
            title=["site escape", "averaged across all Arevirumab-3 antibodies"],  
            # values=[0,1,2,3],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        # scale=alt.Scale(domain=[-0.1, 3.1])
    ),
    alt.X(
        "mutation_frequency",
        axis=alt.Axis(
            title=["mutation frequencies", "in natural sequences"],
            values=[0, 0.001, 0.01, 0.1, 0.5, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
            format="0.3",
        ),
        scale=alt.Scale(type="symlog", constant=0.0005, domain=[-0.0001,1.1])
    ),
    tooltip=[
        "site",
        "wildtype",
        "mutation_frequency",
        "escape_121F",
        "escape_89F",
        "escape_372D",
        "cocktail_avg_site_escape",
    ],
).properties(
    width=300,
    height=300,
)


# Calculate statistics
r, p = sp.stats.pearsonr(
    site_level_df[["cocktail_total_escape","n_effective"]].dropna()["cocktail_total_escape"],
    site_level_df[["cocktail_total_escape","n_effective"]].dropna()["n_effective"],
)
print(f"r correlation of site escape summed across all Arevirumab-3 antibodies and site effective amino acids: {r:.2f}")

natural_vs_antibody_neff = alt.Chart(
    site_level_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.15,
).encode(
    alt.Y(
        "cocktail_total_escape",
        axis=alt.Axis(
            title=["site escape", "summed across all Arevirumab-3 antibodies"],  
            # values=[0,1,2,3],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        # scale=alt.Scale(domain=[-0.1, 3.1])
    ),
    alt.X(
        "n_effective",
        axis=alt.Axis(
            title=["effective amino acids", "in natural sequences"], 
            values=[1,2,3,4],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[1,4])
    ),
    tooltip=[
        "site",
        "wildtype",
        "n_effective",
        "escape_121F",
        "escape_89F",
        "escape_372D",
        "cocktail_total_escape",
    ], 
).properties(
    width=300,
    height=300,
)

# Calculate statistics
r, p = sp.stats.pearsonr(
    site_level_df[["cocktail_avg_site_escape","n_effective"]].dropna()["cocktail_avg_site_escape"],
    site_level_df[["cocktail_avg_site_escape","n_effective"]].dropna()["n_effective"],
)
print(f"r correlation of site escape averaged across all Arevirumab-3 antibodies and site effective amino acids: {r:.2f}")

natural_vs_antibody_avg_neff = alt.Chart(
    site_level_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.15,
).encode(
    alt.Y(
        "cocktail_avg_site_escape",
        axis=alt.Axis(
            title=["site escape", "averaged across all Arevirumab-3antibodies"],  
            # values=[0,1,2,3],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        # scale=alt.Scale(domain=[-0.1, 3.1])
    ),
    alt.X(
        "n_effective",
        axis=alt.Axis(
            title=["effective amino acids", "in natural sequences"], 
            values=[1,2,3,4],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[1,4])
    ),
    tooltip=[
        "site",
        "wildtype",
        "n_effective",
        "escape_121F",
        "escape_89F",
        "escape_372D",
        "cocktail_avg_site_escape",
    ], 
).properties(
    width=300,
    height=300,
)

combined_plot = alt.hconcat(
    natural_vs_antibody_mut_freq,
    natural_vs_antibody_avg_mut_freq,
    natural_vs_antibody_neff,
    natural_vs_antibody_avg_neff,
    spacing=5,
    title=["Site natural sequence diversity vs site antibody escape", "across all Arevirumab-3 antibodies"]
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_natural):
    os.mkdir(out_dir_natural)

combined_plot.save(html_natural_vs_escape_all_abs)

combined_plot

r correlation of site escape summed across all Arevirumab-3 antibodies and site mutational frequency: -0.03
r correlation of site escape averaged across all Arevirumab-3 antibodies and site mutational frequency: -0.03
r correlation of site escape summed across all Arevirumab-3 antibodies and site effective amino acids: -0.01
r correlation of site escape averaged across all Arevirumab-3 antibodies and site effective amino acids: -0.02

subplots = []
for i,antibody in enumerate(["escape_372D", "escape_89F", "escape_121F"]):

    # Get names of antibodies to compare
    antibody_name = antibody.split("_")[1]
    other_antibody_name = ["escape_372D", "escape_89F", "escape_121F"][(i+1)%3].split("_")[1]
    if antibody_name == "372D":
        antibody_name = "37.2D"
    elif antibody_name == "89F":
        antibody_name = "8.9F"
    elif antibody_name == "121F":
        antibody_name = "12.1F"
    else:
        print("ERROR!")
    if other_antibody_name == "372D":
        other_antibody_name = "37.2D"
    elif other_antibody_name == "89F":
        other_antibody_name = "8.9F"
    elif other_antibody_name == "121F":
        other_antibody_name = "12.1F"
    else:
        print("ERROR!")

    curr_subplot = alt.Chart(
        AA_level_df, 
        title=f"{antibody_name} vs {other_antibody_name}",
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            antibody,
            axis=alt.Axis(
                title=f"{antibody_name} escape",
                values=[0,1,2,3,4,5,6],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-0.2, 6])
        ),
        alt.X(
            ["escape_372D", "escape_89F", "escape_121F"][(i+1)%3],
            axis=alt.Axis(
                title=f"{other_antibody_name} escape", 
                values=[0,1,2,3,4,5,6],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-0.2, 6])
        ),
        tooltip=[
            "site",
            "wildtype",
            "mutant",
            "effect",
            "escape_372D", 
            "escape_89F", 
            "escape_121F"
        ],
    ).properties(
        width=250,
        height=250,
    )
    
    subplots.append(curr_subplot)

combined_plot = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
).configure_view(
    stroke=None
)

combined_plot.save(html_arevirumab_comparisons)

combined_plot

# Mark sites to label and color by antibody class with largest escape
site_level_df["cocktail_site_max"] = (
    site_level_df[[
        "escape_121F",
        "escape_372D",
        "escape_89F",
    ]].max(skipna = False, axis=1)
)
site_level_df["site_epitope"] = (
    site_level_df.apply(lambda x: "none" if (x["cocktail_site_max"] == 0) else
                        ("GPC-A" if (x["escape_2510C"] == x["cocktail_site_max"]) else 
                        ("GPC-B" if (x["escape_377H"] == x["cocktail_site_max"] or x["escape_256A"] == x["cocktail_site_max"] or x["escape_372D"] == x["cocktail_site_max"]) else 
                         ("GPC-C" if (x["escape_89F"] == x["cocktail_site_max"]) else 
                          ("GP1-A" if (x["escape_121F"] == x["cocktail_site_max"]) else "error")))), axis=1)
)
# Color only top escape sites with nonzero mutation frequencies
cutoff = site_level_df["n_effective"].quantile(0.75)
print(f"{cutoff} is the 75% percentile for effective amino acids")
top_escape_sites = site_level_df.query("n_effective >= @cutoff").nlargest(10, "cocktail_avg_site_escape", keep="all")["site"].tolist() 
print(f"The following sites are the top 10 escape mutations among the top 25% percentile of effective amino acids{top_escape_sites}")
site_level_df["site_label"] = (
    site_level_df.apply(lambda x: x["site_epitope"] if (x["site"] in top_escape_sites) else "none", axis=1)
)

# Replace zero frequency with small value for easier plotting on log scale
site_level_df["mutation_frequency"] = site_level_df["mutation_frequency"].replace(0, 0.0003)
AA_level_df["mutation_frequency"] = AA_level_df["mutation_frequency"].replace(0, 0.0002)

1.0446081569907109 is the 75% percentile for effective amino acids
The following sites are the top 10 escape mutations among the top 25% percentile of effective amino acids[125, 150, 127, 248, 111, 116, 132, 113, 126, 152]

# Set figure size and subplots
fig, axes = plt.subplots(
    1, 
    6, 
    figsize=(7.7, 1), 
)

# Adjust spacing of subplots
fig.subplots_adjust(
    bottom=0, 
    top=1, 
    wspace=0.2, 
    hspace=0,
)

for index,antibody in enumerate(["2510C", "121F", "377H", "256A", "372D", "89F"]):
    chart = sns.scatterplot(
        data=AA_level_df,
        x="mutation_frequency",
        y="escape_"+antibody,
        edgecolor=None,
        linewidth=0.5,
        s=15,
        alpha=0.15,
        ax=axes[index],
    )
    chart.set_xscale("log")
    chart.xaxis.set_minor_locator(mticker.NullLocator()) # no minor ticks
    xticks = [0.0002, 0.001, 0.01, 0.1]
    chart.set_xticks(xticks)
    x_labels=["0", " $10^{-3}$", " $10^{-2}$", " $10^{-1}$"]
    chart.set_xticklabels(labels=x_labels, horizontalalignment="center", fontsize=6)

    chart.set_xlim(0.000125,1.1)
    chart.set(xlabel=None)

    if index == 0:
        chart.set_ylabel(ylabel="escape", fontsize=8)
        chart.set_ylim(-0.25, 6.35)
        yticks = [0, 1, 2, 3, 4, 5, 6]
        chart.set_yticks(yticks)
        chart.set_yticklabels(labels=["0", "1", "2", "3", "4", "5", "6"], fontsize=6)
        chart.set_title(
            "25.10C", 
            fontsize=8,
            color="#44AA99",
            horizontalalignment="center",
        )
    if index == 1:
        chart.set(ylabel=None)
        chart.set_ylim(-0.25, 6.35)
        yticks = [0, 1, 2, 3, 4, 5, 6]
        chart.set_yticks(yticks)
        chart.set_yticklabels(labels=["0", "1", "2", "3", "4", "5", "6"], fontsize=6)
        chart.set_title(
            "12.1F", 
            fontsize=8,
            color="#999933",
            horizontalalignment="center",
        )
    if index == 2:
        chart.set(ylabel=None)
        chart.set_ylim(-0.29, 7.29)
        yticks = [0, 1, 2, 3, 4, 5, 6, 7]
        chart.set_yticks(yticks)
        chart.set_yticklabels(labels=["0", "1", "2", "3", "4", "5", "6", "7"], fontsize=6)
        chart.set_title(
            "37.7H", 
            fontsize=8,
            color="#AA4499",
            horizontalalignment="center",
        )
    if index == 3:
        chart.set(ylabel=None)
        chart.set_ylim(-0.29, 7.29)
        yticks = [0, 1, 2, 3, 4, 5, 6, 7]
        chart.set_yticks(yticks)
        chart.set_yticklabels(labels=["0", "1", "2", "3", "4", "5", "6", "7"], fontsize=6)
        chart.set_title(
            "25.6A", 
            fontsize=8,
            color="#AA4499",
            horizontalalignment="center",
        )
    if index == 4:
        chart.set(ylabel=None)
        chart.set_ylim(-0.125, 3.125)
        yticks = [0, 1, 2, 3]
        chart.set_yticks(yticks)
        chart.set_yticklabels(labels=["0", "1", "2", "3"], fontsize=6)
        chart.set_title(
            "37.2D", 
            fontsize=8,
            color="#AA4499",
            horizontalalignment="center",
        )
    if index == 5:
        chart.set(ylabel=None)
        chart.set_ylim(-0.25, 6.35)
        yticks = [0, 1, 2, 3, 4, 5, 6]
        chart.set_yticks(yticks)
        chart.set_yticklabels(labels=["0", "1", "2", "3", "4", "5", "6"], fontsize=6)
        chart.set_title(
            "8.9F", 
            fontsize=8,
            color="#117733",
            horizontalalignment="center",
        )

    # Change all spines
    for axis in ["top", "bottom", "left", "right"]:
        chart.spines[axis].set_linewidth(1)
    chart.tick_params(axis="both", length=3, width=1)
    chart.grid(False)
    sns.despine()

    # Label points on each scatter plot
    for i in range(0, AA_level_df.shape[0]):
        x_pos = AA_level_df.at[i, "mutation_frequency"]
        y_pos = AA_level_df.at[i, "escape_"+antibody]
        name = AA_level_df.at[i, "wildtype"] + str(AA_level_df.at[i, "site"]) + AA_level_df.at[i, "mutant"]
        if antibody == "2510C":
            if name == "E228D":
                chart.text(
                    x_pos,
                    y_pos+0.3,
                    f"{name}",
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                )
        if antibody == "121F":
            if name == "N89D":
                chart.text(
                    x_pos,
                    y_pos+0.3,
                    f"*{name}",
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                    style="italic",
                )
        if antibody == "377H":
            if name == "H398K":
                chart.text(
                    x_pos,
                    y_pos+0.3,
                    f"{name}",
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                )
            if name == "D401E":
                chart.text(
                    x_pos,
                    y_pos+0.3,
                    f"{name}",
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                )
        if antibody == "256A":
            if name == "H398K":
                chart.text(
                    x_pos,
                    y_pos+0.3,
                    f"{name}",
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                )
        if antibody == "372D":
            if name == "H398K":
                chart.text(
                    x_pos,
                    y_pos+0.15,
                    f"{name}",
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                )
        if antibody == "89F":
            if name == "K126N":
                # Annotate text with arrow
                chart.annotate(
                    text=f"{name}",
                    xy=(x_pos, y_pos),
                    xytext=(x_pos+0.0075, y_pos+0.3),
                    arrowprops=dict(
                        arrowstyle="-|>",
                        lw=0.5,
                        relpos=(0.05,0.5),
                        color="#000000",
                    ),
                    bbox=dict(pad=-5, facecolor="none", edgecolor="none"),
                    fontsize=6,
                    horizontalalignment="left",
                    color="#000000",
                )

# Common X axis labels
fig.text(0.5, -0.3, "frequency of amino-acid mutation in natural sequences", ha="center", rotation="horizontal", fontsize=8)

# Save fig
fig.savefig(natural_escape)

Compare DMS data to natural sequence data¶

Identify natural sequences that would potentially escape antibody neutralization¶

Escape profiles for the sites that differ in the chosen isolates from the Josiah DMS strain¶

Pseudovirus neutralization validation assays for chosen natural isolates and corresponding single mutants¶

Further analysis¶

Compare DMS data to natural sequences¶