# this cell is tagged as parameters for `papermill` parameterization
altair_config = None
nipah_config = None

neut = None
escape_file = None

nah1_validation_neut_curves = None
IC50_validation_plot = None
combined_ic50_neut_curve_plot = None

# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
neut = "data/custom_analyses_data/experimental_data/nAH1_3_mab_validation_neuts.csv"
escape_file = "results/antibody_escape/averages/nAH1.3_mut_effect.csv"
nah1_validation_neut_curves = "results/images/nah1_validation_neut_curves.html"
IC50_validation_plot = "results/images/IC50_validation_plot.html"
combined_ic50_neut_curve_plot = "results/images/combined_ic50_neut_curve_plot.html"

import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import neutcurve
import scipy.stats
print(f"Using `neutcurve` version {neutcurve.__version__}")
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')

Using `neutcurve` version 0.5.7
Setup in correct directory

ThemeRegistry.enable('main_theme')

if nah1_validation_neut_curves is None:
    #altair_config = "data/custom_analyses_data/theme.py"
    nipah_config = "nipah_config.yaml"
    escape_file = "results/antibody_escape/averages/nAH1.3_mut_effect.csv"
    neut = "data/custom_analyses_data/experimental_data/nAH1_3_mab_validation_neuts.csv"

    ephrin_binding_neuts_file = (
        "data/custom_analyses_data/experimental_data/bat_ephrin_neuts.csv"
    )
    ephrin_validation_curves = "data/custom_analyses_data/experimental_data/binding_single_mutant_validations.csv"
    validation_ic50s_file = (
        "data/custom_analyses_data/experimental_data/receptor_IC_validations.csv"
    )
    e2_monomeric_binding_file = (
        "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
    )
    e3_dimeric_binding_file = (
        "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
    )

with open(nipah_config) as f:
    config = yaml.safe_load(f)

# Estimate neutralization curves using the `curvefits` module from `neutcurve` package.
def get_neutcurve(df, replicate="average"):
    # estimate neut curves
    fits = neutcurve.curvefits.CurveFits(
        data=df,  # Input DataFrame containing the experimental data.
        serum_col="antibody",
        virus_col="virus",
        replicate_col="replicate",
        conc_col="concentration",
        fracinf_col="fraction infectivity",
        fixbottom=0,
    )
    # get neutcurve parameters for different IC values
    fitParams = fits.fitParams(ics=[50, 90, 95, 97, 98, 99])

    # Prepare lists of unique antibodies and viruses for looping.
    antibody_list = list(df["antibody"].unique())
    virus_list = list(df["virus"].unique())

    curve_list = (
        []
    )  # Initialize an empty list to store neutralization curve data frames.
    # Loop through each combination of antibody and virus to get their neutralization curve.
    for antibody in antibody_list:
        for virus in virus_list:
            # get curves and combine
            curve = fits.getCurve(serum=antibody, virus=virus, replicate=replicate)
            neut_df = curve.dataframe()
            neut_df["antibody"] = antibody
            neut_df["virus"] = virus
            curve_list.append(neut_df)

    neut_curve_df = pd.concat(curve_list)
    # Calculate upper and lower bounds for the measurements using standard error (stderr).
    neut_curve_df["upper"] = neut_curve_df["measurement"] + neut_curve_df["stderr"]
    neut_curve_df["lower"] = neut_curve_df["measurement"] - neut_curve_df["stderr"]

    return (
        fitParams,
        neut_curve_df,
    )  # Return the fit parameters and the neut fitted neut curves

# Sorting function to organize mutation strings with 'WT' at the start and the rest numerically.
def custom_sort_order(array):
    # Helper function to extract numerical part from mutation strings.
    def extract_number(virus):
        num = re.search(
            r"\d+", virus
        )  # Find the first sequence of digits in the string.
        return (
            int(num.group()) if num else 0
        )  # Convert digits to integer, or 0 if none found.

    array = sorted(
        array, key=extract_number
    )  # Sort array by the numerical value extracted.

    # Ensure 'WT' (wild type) is the first element in the list if it exists.
    if "WT" in array:
        array.remove("Unmutated")  # Remove 'WT' from its current position.
        array.insert(0, "Unmutated")  # Insert 'WT' at the beginning of the list.
    return array


# Function to plot neutralization curves with customized color coding for each virus mutation.
def plot_neut_curves(df, neut_curve_name, color_name):
    # Manually define colors corresponding to the category10 color scheme plus black.
    category10_colors = [
        "#4E79A5",
        "#F18F3B",
        "#E0585B",
        "#77B7B2",
        "#5AA155",
        "#EDC958",
        "#AF7AA0",
        "#FE9EA8",
        "#9C7561",
        "#BAB0AC",
    ]

    # Assign colors to mutations, starting with black for the first mutation.
    colors = ["black"] + category10_colors[: len(df["virus"].unique()) - 1]
    # Construct a line chart for the neutralization curves.
    chart = (
        alt.Chart(df)
        .mark_line(size=1.5, opacity=1)
        .encode(
            x=alt.X(
                "concentration:Q",
                scale=alt.Scale(type="log"),
                axis=alt.Axis(format=".0e", tickCount=3),
                title=neut_curve_name,
            ),
            y=alt.Y("fit:Q", title="Fraction Infectivity", axis=alt.Axis(tickCount=3)),
            color=alt.Color(
                "virus",
                title=color_name,
                scale=alt.Scale(
                    domain=custom_sort_order(df["virus"].unique()), range=colors
                ),
            ),
        )
        .properties(
            height=config["neut_curve_height"],
            width=config["neut_curve_width"],
        )
    )
    # Add data points as circles on the plot for actual measurements.
    circle = (
        alt.Chart(df)
        .mark_circle(size=40, opacity=1)
        .encode(
            x=alt.X(
                "concentration",
                scale=alt.Scale(type="log"),
                axis=alt.Axis(format=".0e", tickCount=3),
                title=neut_curve_name,
            ),
            y=alt.Y(
                "measurement:Q",
                title="Fraction Infectivity",
                axis=alt.Axis(tickCount=3),
            ),
            color=alt.Color(
                "virus",
                title=color_name,
                scale=alt.Scale(
                    domain=custom_sort_order(df["virus"].unique()), range=colors
                ),
            ),
        )
        .properties(
            height=config["neut_curve_height"],
            width=config["neut_curve_width"],
        )
    )
    # Add error bars
    error = (
        alt.Chart(df)
        .mark_errorbar(opacity=1)
        .encode(
            x="concentration",
            y=alt.Y("lower", title="Fraction Infectivity"),
            y2="upper",
            color="virus",
        )
    )
    plot = (
        chart + circle + error
    )  # Combine line chart, circles, and error bars into one plot.
    return plot

def plot_ic50_correlations(fitparams, name, escape):
    # Merge IC50 and DMS escape dataframes and append WT so it has escape score of 0
    fitparams["lower_bound"] = fitparams["ic50_bound"].apply(lambda x: x == "lower")
    fitparams["mutation"] = fitparams["virus"]
    # Merge with DMS escape data
    merged = fitparams.merge(escape, on=["mutation"])
    wt_rows = fitparams[fitparams["mutation"] == "Unmutated"].copy()
    wt_rows["escape_mean"] = 0
    merged = pd.concat([merged, wt_rows], ignore_index=True)
    display(merged)
    # calculate R value:
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        merged["escape_mean"], merged["ic50"]
    )

    # Define the category10 colors manually
    category10_colors = [
        "#4E79A5",
        "#F18F3B",
        "#E0585B",
        "#77B7B2",
        "#5AA155",
        "#EDC958",
        "#AF7AA0",
        "#FE9EA8",
        "#9C7561",
        "#BAB0AC",
    ]

    # Adjust colors based on the unique mutations
    colors = ["black"] + category10_colors[: len(merged["mutation"].unique()) - 1]

    corr_chart = (
        alt.Chart(merged)
        .mark_point(size=150,filled=True,opacity=1)
        .encode(
            x=alt.X(
                "escape_mean", title="Antibody escape from DMS", axis=alt.Axis(grid=False,tickCount=3)
            ),
            y=alt.Y(
                "ic50",
                title=f"{name}",
                scale=alt.Scale(type="log"),
                axis=alt.Axis(grid=False,tickCount=4),
            ),
            color=alt.Color(
                "mutation",
                title="Virus",
                scale=alt.Scale(
                    domain=custom_sort_order(merged["mutation"].unique()), range=colors
                ),
            ),
            #shape=alt.Shape("lower_bound", title="Lower Bound"),
        )
    )
    text = (
        alt.Chart(
            {
                "values": [
                    {
                        "x": merged["escape_median"].min(),
                        "y": merged["ic50"].max(),
                        "text": f"r = {r_value:.2f}",
                    }
                ]
            }
        )
        .mark_text(align="left", baseline="top", dx=-10)  # Adjust this for position
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    final_chart = corr_chart + text
    return final_chart

# Function to calculate IC50 correlations and generate plots for neutralization curves and IC50 correlations.
def get_neut_curve_ic50_correlations(
    raw_data, escape, neut_curve_name, name, color_name
):
    # Process raw data to separate fitted curve data and neutralization curve data.
    fit_df, neut_df = get_neutcurve(raw_data)

    # Convert IC50 values from µg to ng/mL for better readability.
    fit_df["ic50_ng"] = fit_df["ic50"] * 1000
    # Display IC50 values for each serum-virus combination, rounded for clarity.
    print("Here are the ic50 values:")
    display(fit_df[["serum", "virus", "ic50_ng"]].round(2))

    # Generate a plot of the neutralization curves using the provided data and color scheme.
    neut_curves_plot = plot_neut_curves(neut_df, neut_curve_name, color_name)

    # Create a plot showing correlations between IC50 values and a specified variable (e.g., escape).
    ic50_correlations = plot_ic50_correlations(fit_df, name, escape)

    # Combine the neutralization curve plot and the IC50 correlation plot side by side.
    combined_neut_curve = neut_curves_plot | ic50_correlations

    # Return the individual plots as well as the combined plot for further use or display.
    return neut_curves_plot, ic50_correlations, combined_neut_curve

# Load escape mutation data from a CSV file
escape = pd.read_csv(escape_file)
# Load neutralization data from another CSV file
neuts = pd.read_csv(neut)

# Get rid of Y455M because its not present in escape data
neuts = neuts[neuts["virus"] != "Y455M"]


def fix_neut_df(df):
    df = df[["serum", "virus", "replicate", "concentration", "fraction infectivity"]]
    df = df.rename(columns={"serum": "antibody"})
    return df


# Fix naming in dataframe
nAH1_validation_neuts = fix_neut_df(neuts)

# Calculate neut curve, ic50 correlations with DMS data, and combine figures
nAH1_curve_plot, nAH1_ic50_correlations, nAH1_combined = (
    get_neut_curve_ic50_correlations(
        nAH1_validation_neuts,  # Single mutant validations
        escape,  # Filtered DMS escape file
        "nAH1.3 Concentration (μg/ml)",  # Title for the neutralization curve
        "Validation IC₅₀ (μg/ml)",  # Title for the IC50 correlation plot
        "Virus",  # Color legend title
    )
)

nAH1_curve_plot.display()
if combined_ic50_neut_curve_plot is not None:
    nAH1_curve_plot.save(nah1_validation_neut_curves)

nAH1_ic50_correlations.display()
if combined_ic50_neut_curve_plot is not None:
    nAH1_ic50_correlations.save(IC50_validation_plot)

nAH1_combined.display()
if combined_ic50_neut_curve_plot is not None:
    nAH1_combined.save(combined_ic50_neut_curve_plot)

Here are the ic50 values:

	serum	virus	ic50_ng
0	nAH1.3	L184H	10.80
1	nAH1.3	Q450S	6.54
2	nAH1.3	W519T	7.92
3	nAH1.3	I520T	8725.69
4	nAH1.3	D468V	487.70
5	nAH1.3	I517K	10000.00
6	nAH1.3	P185D	10000.00
7	nAH1.3	Unmutated	6.83

	serum	virus	replicate	nreplicates	ic50	ic50_bound	ic50_str	ic90	ic90_bound	ic90_str	...	mutant	escape_mean	escape_median	escape_std	n_models	times_seen	frac_models	LibA-230725-nAH1.3	LibB-230630-nAH1.3	LibB-230720-nAH1.3
0	nAH1.3	L184H	average	2	0.010802	interpolated	0.0108	0.043326	interpolated	0.0433	...	H	0.3322	0.3080	0.24800	3.0	4.667	1.0	0.3080	0.09721	0.5915
1	nAH1.3	Q450S	average	2	0.006540	interpolated	0.00654	0.032273	interpolated	0.0323	...	S	0.3267	0.3306	0.04523	3.0	3.333	1.0	0.2797	0.33060	0.3699
2	nAH1.3	W519T	average	2	0.007923	interpolated	0.00792	0.026042	interpolated	0.026	...	T	0.2210	0.2147	0.05903	3.0	10.000	1.0	0.2147	0.28290	0.1654
3	nAH1.3	I520T	average	2	8.725689	interpolated	8.73	10.000000	lower	>10	...	T	1.2640	1.2170	0.60140	3.0	8.000	1.0	0.6875	1.21700	1.8880
4	nAH1.3	D468V	average	2	0.487700	interpolated	0.488	10.000000	lower	>10	...	V	1.8670	1.5540	1.13600	3.0	6.667	1.0	0.9211	1.55400	3.1270
5	nAH1.3	I517K	average	2	10.000000	lower	>10	10.000000	lower	>10	...	K	2.1900	2.5660	1.12800	3.0	7.667	1.0	0.9221	2.56600	3.0810
6	nAH1.3	P185D	average	2	10.000000	lower	>10	10.000000	lower	>10	...	D	1.9620	1.8280	1.23900	3.0	7.333	1.0	0.7950	1.82800	3.2620
7	nAH1.3	Unmutated	average	2	0.006834	interpolated	0.00683	0.019276	interpolated	0.0193	...	NaN	0.0000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

IC50_validations¶

For running interactively¶

Read in config files¶

Function for getting fitting neut curve from experimental data¶

Make neut curve plot¶

Now calculate correlations with DMS data and plot¶

Combine all the data from above in single function¶

Do for antibody validations¶