# Cell tagged as "parameters" for `papermill`: every parameter starts unset.
# -- files that are read
nipah_config = None
altair_config = None
validation_file_E2 = None
validation_file_E3 = None
func_scores_E2_file = None
func_scores_E3_file = None
# -- plots that are written
func_score_E2_plot = None
func_score_E3_plot = None
corr_plots_combined = None

# Parameters
# Concrete values; these assignments run after, and override, the None
# defaults above.
altair_config = "data/custom_analyses_data/theme.py"
nipah_config = "nipah_config.yaml"
validation_file_E2 = (
    "data/custom_analyses_data/experimental_data/"
    "functional_validations_EFNB2.csv"
)
validation_file_E3 = (
    "data/custom_analyses_data/experimental_data/"
    "functional_validations_EFNB3.csv"
)
func_scores_E2_file = (
    "results/func_effects/averages/" "CHO_bEFNB2_func_effects.csv"
)
func_scores_E3_file = (
    "results/func_effects/averages/" "CHO_bEFNB3_func_effects.csv"
)
func_score_E2_plot = "results/images/" "func_score_E2_validation.html"
func_score_E3_plot = "results/images/" "func_score_E3_validation.html"
corr_plots_combined = "results/images/" "corr_plots_combined.html"

import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
# The project path is written with a trailing slash, whereas os.getcwd()
# normally returns the directory without one, so a raw string comparison never
# matches. Both sides are normalized before comparing so the "already there"
# branch can actually be taken.
project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
if os.path.normpath(os.getcwd()) == os.path.normpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

# import altair themes from /data/custom_analyses_data/theme.py and enable
# (the relative path is resolved against the working directory set above)
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')

Setup in correct directory

ThemeRegistry.enable('main_theme')

# Read the general pipeline config.
# NOTE(review): `config` is reassigned from `nipah_config` at the end of this
# section, so the value loaded here is replaced before any visible use.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

# For running interactively: when no combined-plot output path was supplied,
# fall back to the default input locations.
if corr_plots_combined is None:
    func_scores_E2_file = (
        "results/func_effects/averages/" "CHO_bEFNB2_func_effects.csv"
    )
    func_scores_E3_file = (
        "results/func_effects/averages/" "CHO_bEFNB3_func_effects.csv"
    )
    validation_file_E2 = (
        "data/custom_analyses_data/experimental_data/"
        "functional_validations_EFNB2.csv"
    )
    validation_file_E3 = (
        "data/custom_analyses_data/experimental_data/"
        "functional_validations_EFNB3.csv"
    )
    nipah_config = "nipah_config.yaml"

# Load the Nipah-specific config; this replaces the `config` read above.
with open(nipah_config) as f:
    config = yaml.safe_load(f)

# Luciferase readings for the bEFNB2 validation; the shared column name is
# made receptor-specific so both tables can sit side by side.
func_validations_EFNB2 = pd.read_csv(validation_file_E2, na_filter=None).rename(
    columns={"mean_luciferase": "mean_luciferase_E2"}
)

# Same for bEFNB3, minus its `mutation` column (the bEFNB2 table keeps its own).
func_validations_EFNB3 = (
    pd.read_csv(validation_file_E3, na_filter=None)
    .rename(columns={"mean_luciferase": "mean_luciferase_E3"})
    .drop(columns="mutation")
)

# NOTE(review): the column-wise concat pairs rows by index, so it relies on
# both CSV files listing the same mutants in the same row order -- confirm.
concat = pd.concat(
    [func_validations_EFNB2, func_validations_EFNB3],
    axis=1,
)
display(concat.head(30))

# DMS functional effects measured on CHO-bEFNB2, with a `mutation` label
# (wildtype + site + mutant, e.g. "Q530F") built to match the validation table.
func_scores = pd.read_csv(func_scores_E2_file).assign(
    mutation=lambda df: df["wildtype"] + df["site"].astype(str) + df["mutant"]
)

# Same for the CHO-bEFNB3 selections.
func_scores_E3 = pd.read_csv(func_scores_E3_file).assign(
    mutation=lambda df: df["wildtype"] + df["site"].astype(str) + df["mutant"]
)

# Put both receptors in one table; overlapping columns get _E2 / _E3 suffixes.
func_scores_merged = pd.merge(
    func_scores,
    func_scores_E3,
    on=["mutation"],
    how="left",
    suffixes=["_E2", "_E3"],
)

# Attach the DMS effects to every validation measurement.
merged = pd.merge(concat, func_scores_merged, on=["mutation"], how="left")

# Give the "Unmutated" rows a tiny non-zero effect for both receptors
# (original author's stated reason: a very small number other than 0 so it can
# be plotted on a log scale).
is_unmutated = merged["mutation"] == "Unmutated"
merged.loc[is_unmutated, ["effect_E2", "effect_E3"]] = 0.0000001

# Correlation coefficient (r) from a linear regression of the validation
# luciferase signal on the DMS cell-entry effect, CHO-bEFNB2.
regression_E2 = scipy.stats.linregress(
    merged["effect_E2"],
    merged["mean_luciferase_E2"],
)
slope, intercept, r_value, p_value, std_err = regression_E2
# Plain Python float so it formats cleanly in the plot annotation.
r_value = float(r_value)


# Legend ordering: "Unmutated" first, then mutants by site number
def custom_sort_order(array):
    """Return the labels of *array* as a list ordered for the plot legend.

    Labels are sorted by the first run of digits they contain (e.g. 530 for
    'Q530F'); labels without digits sort as 0. If "Unmutated" is present, its
    first occurrence is moved to the front of the list.
    """

    def site_of(label):
        # First integer embedded in the label, or 0 when there is none.
        found = re.search(r"\d+", label)
        if found is None:
            return 0
        return int(found.group())

    ordered = sorted(array, key=site_of)

    try:
        position = ordered.index("Unmutated")
    except ValueError:
        # No unmutated entry to promote.
        return ordered

    ordered.insert(0, ordered.pop(position))
    return ordered


# Discrete palette for the mutants, defined manually.
# NOTE(review): these hex values look like Vega's "tableau10" scheme rather
# than "category10" -- confirm; the variable name is kept as-is.
category10_colors = [
    "#4e79a7",
    "#f28e2c",
    "#e15759",
    "#76b7b2",
    "#59a14f",
    "#edc949",
    "#af7aa1",
    "#ff9da7",
    "#9c755f",
    "#bab0ab",
]
# "black" comes first, followed by one palette color for each remaining
# unique mutation label.
n_mutation_labels = len(merged["mutation"].unique())
colors = ["black", *category10_colors[: n_mutation_labels - 1]]

# Create the Altair chart: DMS cell-entry effect (x) against the validation
# luciferase signal (y, log10 axis) for CHO-bEFNB2, with color and shape both
# keyed on the mutation label.
corr_chart = (
    alt.Chart(merged, title=alt.Title("CHO-bEFNB2"))
    .encode(
        x=alt.X(
            "effect_E2:Q",
            title="Cell entry in DMS",
            scale=alt.Scale(domain=[-4, 1]),
            axis=alt.Axis(values=[-4, -3, -2, -1, 0, 1], tickCount=6),
        ),
        y=alt.Y(
            "mean_luciferase_E2",
            # Axis title typo fixed ("asasy" -> "assay").
            title="RLU/μL in validation assay",
            scale=alt.Scale(type="log", base=10),
            axis=alt.Axis(
                format=".0e", tickCount=4
            ),  # Display in scientific notation
        ),
        color=alt.Color(
            "mutation",
            title="Virus",
            scale=alt.Scale(
                domain=custom_sort_order(merged["mutation"].unique()), range=colors
            ),
        ),
        # Shape uses the same legend order as color.
        shape=alt.Shape(
            "mutation",
            scale=alt.Scale(domain=custom_sort_order(merged["mutation"].unique())),
        ),
        tooltip=["mutation", "effect_E2", "mean_luciferase_E2"],
    )
    .mark_point(size=125, filled=True, opacity=0.5)
)

# Anchor for the "r = ..." label: smallest effect and largest luciferase value.
# int() truncates toward zero, so these are whole-number data coordinates.
min_effect_E2 = int(merged["effect_E2"].min())
max_mean_luciferase_E2 = int(merged["mean_luciferase_E2"].max())

# Single-row inline dataset holding the correlation label.
text = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E2,
                    "y": max_mean_luciferase_E2,
                    "text": f"r = {r_value:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-30,  # pixel offset from the anchor point
        dy=-30,  # pixel offset from the anchor point
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)

# Layer the label over the scatter plot.
final_chart = corr_chart + text
final_chart = final_chart.properties(height=200, width=200).resolve_scale(
    shape='independent', color='independent'
)

func_score_E2_chart = final_chart
func_score_E2_chart.display()
# Only write the file when output paths were supplied (non-interactive run).
if corr_plots_combined is not None:
    func_score_E2_chart.save(func_score_E2_plot)

# Correlation coefficient (r) from a linear regression of the validation
# luciferase signal on the DMS cell-entry effect, CHO-bEFNB3.
regression_E3 = scipy.stats.linregress(
    merged["effect_E3"],
    merged["mean_luciferase_E3"],
)
slope, intercept, r_value, p_value, std_err = regression_E3
# Plain Python float so it formats cleanly in the plot annotation.
r_value = float(r_value)


# Sorting function to put the wildtype entry on top of the legend, followed by
# the mutants in numerical (site) order
def custom_sort_order(array):
    """Return the labels of *array* as a list ordered for the plot legend.

    Labels are sorted by the first run of digits they contain (e.g. 530 for
    'Q530F'); labels without digits sort as 0. The wildtype label is then moved
    to the front. The wildtype is labelled "Unmutated" in this analysis; the
    older "WT" label is still honoured so existing callers keep working.
    """

    def extract_number(mutation):
        # First integer embedded in the label, or 0 when there is none.
        num = re.search(r"\d+", mutation)
        return int(num.group()) if num else 0

    ordered = sorted(array, key=extract_number)

    # Promote "WT" first and "Unmutated" last so that "Unmutated" ends up at
    # the very front when both labels are present.
    for wildtype_label in ("WT", "Unmutated"):
        if wildtype_label in ordered:
            ordered.remove(wildtype_label)
            ordered.insert(0, wildtype_label)
    return ordered

# Adjust colors based on the unique mutations: "black" first, then one palette
# color for each remaining unique mutation label.
colors = ["black"] + category10_colors[: len(merged["mutation"].unique()) - 1]

# Create the Altair chart: DMS cell-entry effect (x) against the validation
# luciferase signal (y, log10 axis) for CHO-bEFNB3, with color and shape both
# keyed on the mutation label.
corr_chart = (
    alt.Chart(merged, title=alt.Title("CHO-bEFNB3"))
    .encode(
        x=alt.X(
            "effect_E3:Q",
            title="Cell entry in DMS",
            scale=alt.Scale(domain=[-4, 1]),
            axis=alt.Axis(values=[-4, -3, -2, -1, 0, 1], tickCount=6),
        ),
        y=alt.Y(
            "mean_luciferase_E3",
            # Axis title typo fixed ("asasy" -> "assay").
            title="RLU/μL in validation assay",
            scale=alt.Scale(type="log", base=10),
            axis=alt.Axis(
                format=".0e", tickCount=4
            ),  # Display in scientific notation
        ),
        color=alt.Color(
            "mutation",
            title="Virus",
            scale=alt.Scale(
                domain=custom_sort_order(merged["mutation"].unique()), range=colors
            ),
        ),
        tooltip=["mutation", "effect_E3", "mean_luciferase_E3"],
        # Shape uses the same legend order as color.
        shape=alt.Shape(
            "mutation",
            scale=alt.Scale(domain=custom_sort_order(merged["mutation"].unique())),
        ),
    )
    .mark_point(size=125, filled=True, opacity=0.5)
)

# Anchor for the "r = ..." label: smallest effect and largest luciferase value.
# int() truncates toward zero, so these are whole-number data coordinates.
min_effect_E3 = int(merged["effect_E3"].min())
max_mean_luciferase_E3 = int(merged["mean_luciferase_E3"].max())

# Single-row inline dataset holding the correlation label.
text = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E3,
                    "y": max_mean_luciferase_E3,
                    "text": f"r = {r_value:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-30,  # Adjust this for position
        dy=-15,  # Adjust this for position
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)

# Layer the label over the scatter plot.
final_chart = (corr_chart + text).resolve_scale(shape='independent', color='independent')
final_chart = final_chart.properties(width=200, height=200)
func_score_E3_chart = final_chart
func_score_E3_chart.display()
# Only write the file when output paths were supplied (non-interactive run).
if corr_plots_combined is not None:
    func_score_E3_chart.save(func_score_E3_plot)

# Combine the plots: bEFNB2 and bEFNB3 panels side by side with the color
# scale resolved as shared. One chart object is built and used for both the
# on-screen display and the saved file, so the saved HTML carries the same
# scale resolution as what is displayed (previously the saved chart was built
# without `resolve_scale`).
combined_corr_chart = (func_score_E2_chart | func_score_E3_chart).resolve_scale(
    color='shared'
)
combined_corr_chart.display()
# Only write the file when an output path was supplied (non-interactive run).
if corr_plots_combined is not None:
    combined_corr_chart.save(corr_plots_combined)

	mutation	mean_luciferase_E2	mean_luciferase_E3
0	Q530F	13839	39157
1	Q530F	11669	38929
2	Q530F	12583	32625
3	Y389T	9223	18
4	Y389T	9564	21
5	Y389T	6327	30
6	Q530E	8952	3
7	Q530E	4825	2
8	Q530E	7865	1
9	P488S	12841	2012
10	P488S	1023	155
11	P488S	9621	1672
12	Q530L	13695	5601
13	Q530L	13589	5730
14	Q530L	11817	5136
15	C162L	1	30
16	C162L	4	9
17	C162L	1	20
18	F168A	7771	24956
19	F168A	9860	29080
20	F168A	6493	23138
21	Unmutated	8775	24502
22	Unmutated	9407	23416
23	Unmutated	9936	32370
24	Q492L	19403	42769
25	Q492L	10533	36135
26	Q492L	11757	46212

cell_entry_validations.ipynb¶

Papermill parameters¶

Import packages, set working directory, import altair theme¶

Read config file¶

For running interactively¶

Load in config files¶

Import luciferase (RLUs/uL) readings for each mutant¶

Now import func scores and make new column to match above data frame to merge on¶

Now Plot Correlations¶

CHO-EFNB2 Entry Correlations¶

CHO-EFNB3 entry correlations plot¶