# this cell is tagged as parameters for `papermill` parameterization
altair_config = None
BLI_corr_plot = None

binding_E2_file = None
binding_E3_file = None

# Parameters
altair_config = "data/custom_analyses_data/theme.py"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
BLI_corr_plot = "results/images/binding_BLI_corr.html"

import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')

Setup in correct directory

ThemeRegistry.enable('main_theme')

##hard paths in case don't want to run with snakemake
if BLI_corr_plot is None:
    print("loading hard paths")
    nipah_config = "nipah_config.yaml"
    
    # input files
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"

# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
display(df_E2_filter.head(3))
df_E3_filter = pd.read_csv(binding_E3_file)
display(df_E3_filter.head(3))

### load BLI data and merge with binding from DMS
BLI_df = pd.read_csv('data/custom_analyses_data/experimental_data/BLI_data.csv')

#merge
BLI_w_E2 = pd.merge(BLI_df,df_E2_filter[['site','wildtype','mutant','binding_mean']],on=['site','mutant'],how='left')
df = pd.merge(BLI_w_E2,df_E3_filter[['site','wildtype','mutant','binding_mean']],on=['site','mutant','wildtype'],how='left',suffixes=['_E2','_E3'])

#Make mutation column
df['mutation'] = (df['wildtype'].astype(str) + df['site'].astype(str) + df['mutant'].astype(str)).astype(object)
df.drop(['site','mutant','wildtype'],axis=1,inplace=True)

#Add unmutated information
new_row = {'B2_1': 0, 'B2_2': 0, 'B3_1': 0, 'B3_2': 0, 'binding_mean_E2': 0, 'binding_mean_E3': 0, 'mutation': 'Unmutated'}
new_df = pd.DataFrame([new_row])

#combine
df = pd.concat([df,new_df])
display(df)

df['B2_mean'] = df[['B2_1', 'B2_2']].mean(axis=1)
df['B3_mean'] = df[['B3_1', 'B3_2']].mean(axis=1)
df['B2_std'] = df[['B2_1','B2_2']].std(axis=1)
df['B3_std'] = df[['B3_1','B3_2']].std(axis=1)
df['B2_upper'] = df['B2_mean'] + df['B2_std']
df['B2_lower'] = df['B2_mean'] - df['B2_std']
df['B3_upper'] = df['B3_mean'] + df['B3_std']
df['B3_lower'] = df['B3_mean'] - df['B3_std']

display(df)

##### calculate R value:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    df["B2_mean"], df["binding_mean_E2"]
)
r_value_E2 = float(r_value)
print(r_value_E2)

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    df["B3_mean"], df["binding_mean_E3"]
)
r_value_E3 = float(r_value)
print(r_value_E3)

0.9763779669852755
0.9057068049246291

# Sorting function to put 'WT' on top of the legend, followed by numerical order
def custom_sort_order(array):
    # Sort based on the numerical part in mutation strings, e.g., '530' in 'Q530F'
    def extract_number(mutation):
        num = re.search(r"\d+", mutation)
        return int(num.group()) if num else 0

    array = sorted(array, key=extract_number)

    # Move 'WT' to the beginning of the list
    if "Unmutated" in array:
        array.remove("Unmutated")
        array.insert(0, "Unmutated")
    return array


# Define the category10 colors manually
category10_colors = ["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]

# Adjust colors based on the unique mutations
colors = ["black"] + category10_colors[: len(df["mutation"].unique()) - 1]

# Create the Altair chart
corr_chart = alt.Chart(df, title=alt.Title("bEFNB2")).mark_point(size=125,filled=True,opacity=1).encode(
        x=alt.X(
            "binding_mean_E2:Q",
            title="DMS Binding",
            #scale=alt.Scale(domain=[-4, 2]),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "B2_mean",
            title="Binding measured by BLI",
            #scale=alt.Scale(type="log", base=10),
            axis=alt.Axis(tickCount=4),
            #    format=".0e", tickCount=4
            #),  # Display in scientific notation
        ),
        color=alt.Color(
            "mutation",
            title="Variant",
            scale=alt.Scale(
                domain=custom_sort_order(df["mutation"].unique()), range=colors
            ),
        ),
)

min_effect_E2 = int(df["binding_mean_E2"].min())
max_mean_luciferase_E2 = int(df["B2_mean"].max())

text = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E2,
                    "y": max_mean_luciferase_E2,
                    "text": f"r = {r_value_E2:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-10,  # Adjust this for position
        dy=-20,  # Adjust this for position
        
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
error = (
        alt.Chart(df)
        .mark_errorbar(opacity=1)
        .encode(
            x="binding_mean_E2",
            y=alt.Y("B2_lower",title='Binding measured by BLI'),
            y2="B2_upper",
            color="mutation",
        )
)
# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')

final_chart_E2 = vline + hline + corr_chart  + error + text
final_chart = final_chart_E2.properties(height=200,width=200)
final_chart

# Create the Altair chart
corr_chart_E3 = alt.Chart(df, title=alt.Title("bEFNB3")).mark_point(size=125,filled=True,opacity=1).encode(
        x=alt.X(
            "binding_mean_E3:Q",
            title="DMS Binding",
            scale=alt.Scale(domain=[-1, 1.5]),
            axis=alt.Axis(tickCount=3),
        ),
        y=alt.Y(
            "B3_mean",
            title="Binding measured by BLI",
            #scale=alt.Scale(type="log", base=10),
            axis=alt.Axis(tickCount=4),
            #    format=".0e", tickCount=4
            #),  # Display in scientific notation
        ),
        color=alt.Color(
            "mutation",
            title="Variant",
            scale=alt.Scale(
                domain=custom_sort_order(df["mutation"].unique()), range=colors
            ),
        ),
)
min_effect_E2 = int(df["binding_mean_E3"].min())
max_mean_luciferase_E2 = int(df["B3_mean"].max())

text_E3 = (
    alt.Chart(
        {
            "values": [
                {
                    "x": min_effect_E2,
                    "y": max_mean_luciferase_E2,
                    "text": f"r = {r_value_E3:.2f}",
                }
            ]
        }
    )
    .mark_text(
        align="left",
        baseline="top",
        dx=-70,  # Adjust this for position
        dy=0,  # Adjust this for position
        
    )
    .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)

error = (
        alt.Chart(df)
        .mark_errorbar(opacity=1)
        .encode(
            x="binding_mean_E3",
            y=alt.Y("B3_lower", title="Binding measured by BLI"),
            y2="B3_upper",
            color="mutation",
        )
)

# Vertical line at x=0
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
# Horizontal line at y=0
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')

final_chart_E3 = vline + hline + corr_chart_E3  + error + text_E3 
final_chart_E3 = final_chart_E3.properties(height=200,width=200)
final_chart_E3

combined = alt.hconcat(final_chart_E2, final_chart_E3)#.resolve_scale(y='shared')
combined.display()
combined.save(BLI_corr_plot)

	site	wildtype	mutant	binding_mean	binding_std	times_seen_binding	effect	effect_std	times_seen_cell_entry	frac_models	mutant_type
0	71	Q	D	-0.7817	1.6460	3.25	-1.164	0.8890	4.500	1.0	Negative
1	71	Q	E	0.1659	0.4309	3.50	-1.255	0.3123	5.375	1.0	Negative
2	71	Q	F	-0.3429	0.5299	3.00	-1.058	0.6637	4.625	1.0	Aromatic

	site	wildtype	mutant	binding_mean	binding_std	times_seen_binding	effect	effect_std	times_seen_cell_entry	frac_models	mutant_type
0	71	Q	C	-0.1332	0.1525	3.0	-0.7227	0.7828	3.000	1.0	Special
1	71	Q	D	-0.2937	0.3305	4.0	-0.3884	0.6369	3.429	1.0	Negative
2	71	Q	E	-0.3527	0.1762	4.0	-0.2482	0.9791	4.571	1.0	Negative

	B2_1	B2_2	B3_1	B3_2	binding_mean_E2	binding_mean_E3	mutation
0	1.562500	3.339844	-5.419813	-6.871482	0.09378	-0.29390	V244W
1	-1.929012	-0.527344	-37.849845	-34.380863	-0.07562	-0.14020	L305W
2	-3.587963	-1.328125	146.112839	155.863039	-0.21260	1.26100	Q492L
3	-1.581790	3.066406	44.757885	35.905253	0.67810	0.19680	V507I
4	-25.810185	-26.523438	-24.855620	-20.333021	-4.01500	0.08437	Q530F
5	7.793210	11.992188	56.352732	80.886492	1.94500	0.99430	S553W
6	-6.732253	-5.996094	-20.746335	-21.998124	-1.43500	-0.61140	D555Y
7	3.993056	4.238281	42.536650	23.006567	1.19500	0.28830	Q559R
8	-8.738426	-7.382812	-45.335406	-53.939962	-1.82300	-0.29660	I588V
0	0.000000	0.000000	0.000000	0.000000	0.00000	0.00000	Unmutated

	B2_1	B2_2	B3_1	B3_2	binding_mean_E2	binding_mean_E3	mutation	B2_mean	B3_mean	B2_std	B3_std	B2_upper	B2_lower	B3_upper	B3_lower
0	1.562500	3.339844	-5.419813	-6.871482	0.09378	-0.29390	V244W	2.451172	-6.145648	1.256772	1.026485	3.707944	1.194400	-5.119163	-7.172133
1	-1.929012	-0.527344	-37.849845	-34.380863	-0.07562	-0.14020	L305W	-1.228178	-36.115354	0.991129	2.452940	-0.237049	-2.219307	-33.662413	-38.568294
2	-3.587963	-1.328125	146.112839	155.863039	-0.21260	1.26100	Q492L	-2.458044	150.987939	1.597947	6.894433	-0.860097	-4.055991	157.882372	144.093506
3	-1.581790	3.066406	44.757885	35.905253	0.67810	0.19680	V507I	0.742308	40.331569	3.286771	6.259756	4.029079	-2.544463	46.591326	34.071813
4	-25.810185	-26.523438	-24.855620	-20.333021	-4.01500	0.08437	Q530F	-26.166812	-22.594320	0.504346	3.197960	-25.662465	-26.671158	-19.396360	-25.792281
5	7.793210	11.992188	56.352732	80.886492	1.94500	0.99430	S553W	9.892699	68.619612	2.969126	17.347988	12.861824	6.923573	85.967600	51.271624
6	-6.732253	-5.996094	-20.746335	-21.998124	-1.43500	-0.61140	D555Y	-6.364173	-21.372229	0.520543	0.885148	-5.843630	-6.884717	-20.487081	-22.257378
7	3.993056	4.238281	42.536650	23.006567	1.19500	0.28830	Q559R	4.115668	32.771608	0.173401	13.809855	4.289069	3.942268	46.581463	18.961754
8	-8.738426	-7.382812	-45.335406	-53.939962	-1.82300	-0.29660	I588V	-8.060619	-49.637684	0.958563	6.084340	-7.102056	-9.019183	-43.553345	-55.722024
0	0.000000	0.000000	0.000000	0.000000	0.00000	0.00000	Unmutated	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000

plot_BLI_data.ipynb¶