# this cell is tagged as parameters for `papermill` parameterization
# Every name below defaults to None; the "# Parameters" assignments that follow
# rebind them to concrete paths.

# input configs
altair_config = None
nipah_config = None

# E2 (CHO-bEFNB2) specific input files
func_scores_E2_file = None
binding_E2_file = None
# E3 (CHO-bEFNB3) specific input files
func_scores_E3_file = None
binding_E3_file = None

# merged / concatenated entry-score input files
merged_df_file = None
concat_df_file = None

# output plots (paths handed to the charts' .save() calls further down)
output_corr = None
entry_binding_corr_plot_E2_output = None
entry_binding_corr_plot_E3_output = None
corr_entry_binding_large_output = None
combined_binding_output = None
entry_by_site_plot_e2_output = None
entry_by_site_plot_e3_output = None
entry_by_site_plot_e2_bar_plot = None
binding_letter_plot = None
entry_letter_plot = None
entry_letter_plot_slider = None
binding_letter_plot_slider = None
entry_by_site_plot_e3_bar_plot = None
# Parameters
# Concrete values for the names declared as None above. The relative paths are
# opened only after the notebook changes into the project directory.
# NOTE(review): this looks like papermill's injected-parameters cell - confirm.
altair_config = "data/custom_analyses_data/interactive_theme.py"
nipah_config = "nipah_config.yaml"
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
output_corr = "results/images/corr_heatmap.html"
entry_binding_corr_plot_E2_output = "results/images/entry_binding_corr_plot_E2.html"
entry_binding_corr_plot_E3_output = "results/images/entry_binding_corr_plot_E3.html"
corr_entry_binding_large_output = "results/images/corr_entry_binding_large.html"
combined_binding_output = "results/images/combined_binding.html"
entry_by_site_plot_e2_output = "results/images/entry_by_site_plot_e2.html"
entry_by_site_plot_e3_output = "results/images/entry_by_site_plot_e3.html"
entry_by_site_plot_e2_bar_plot = "results/images/entry_by_site_plot_e2_bar_plot.html"
binding_letter_plot = "results/images/binding_letter_plot.html"
entry_letter_plot = "results/images/entry_letter_plot.html"
entry_letter_plot_slider = "results/images/entry_letter_plot_slider.html"
binding_letter_plot_slider = "results/images/binding_letter_plot_slider.html"
entry_by_site_plot_e3_bar_plot = "results/images/entry_by_site_plot_e3_bar_plot.html"

import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
# os.getcwd() does not report a trailing slash, so compare normalized paths;
# a raw string comparison against the slash-terminated path can never match.
project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
if os.path.normpath(os.getcwd()) == os.path.normpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

# Import the Altair theme module `interactive_theme` from
# data/custom_analyses_data/ and enable it. The path is relative, so it resolves
# against the working directory selected above.
sys.path.append('data/custom_analyses_data/')
import interactive_theme
alt.themes.register('interactive_theme', interactive_theme.interactive_theme)
alt.themes.enable('interactive_theme')

Setup in correct directory

ThemeRegistry.enable('interactive_theme')

# Fallback input paths, used only when no config path was supplied.
# Output paths are not set here, so they keep whatever value they already have.
if nipah_config is None:
    #input files
    #altair_config = 'data/custom_analyses_data/interactive_theme.py'
    nipah_config = 'nipah_config.yaml'
    
    # cell-entry and binding scores for CHO-bEFNB2
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    
    # cell-entry and binding scores for CHO-bEFNB3
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
    
    # NOTE(review): antibody_file is only assigned in this branch and is not
    # read anywhere in the visible part of the file - confirm it is still needed.
    antibody_file = 'results/filtered_data/escape/mab_filter_concat.csv'
    merged_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_merged.csv'
    concat_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_concat.csv'

# Parse the YAML config; plot_entry_binding_interactive reads
# config['contact_sites'] from it.
with open(nipah_config) as f:
    config = yaml.safe_load(f)

# `display` is not imported in this file; presumably the IPython/Jupyter
# built-in - confirm the script is only run as a notebook.
merged_df = pd.read_csv(merged_df_file) # merged entry scores (input to correlation_heatmap)
display(merged_df.head(5))
concat_df = pd.read_csv(concat_df_file) # concatenated entry scores
display(concat_df.head(5))

# Read filtered cell entry data
def read_func_data(file,name):
    """Load cell-entry scores from a CSV and tag every row with a cell type.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.
        name: Value stored in the added ``cell_type`` column.

    Returns:
        DataFrame with columns ``site``, ``wildtype``, ``mutant``, ``effect``,
        ``mutant_type`` and ``cell_type`` (in that order).

    Raises:
        KeyError: If any of the required columns is missing from the CSV.
    """
    wanted_columns = ['site','wildtype','mutant','effect','mutant_type']
    raw_scores = pd.read_csv(file)
    # Keep only the columns used downstream, then append the label column.
    return raw_scores[wanted_columns].assign(cell_type=name)

# Call func to read in cell entry data
# One entry-score table per receptor cell line, labelled via `cell_type`.
e2_func_df = read_func_data(func_scores_E2_file, 'CHO-bEFNB2')
e3_func_df = read_func_data(func_scores_E3_file, 'CHO-bEFNB3')

# Read filtered binding data
def read_binding_data(file,name):
    """Load receptor-binding scores from a CSV and tag every row with a cell type.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.
        name: Value stored in the added ``cell_type`` column.

    Returns:
        DataFrame with columns ``site``, ``wildtype``, ``mutant``,
        ``binding_mean``, ``mutant_type`` and ``cell_type`` (in that order).

    Raises:
        KeyError: If any of the required columns is missing from the CSV.
    """
    wanted_columns = ['site','wildtype','mutant','binding_mean','mutant_type']
    raw_binding = pd.read_csv(file)
    # Keep only the columns used downstream, then append the label column.
    return raw_binding[wanted_columns].assign(cell_type=name)

# Call func to read in binding data
# One binding table per receptor cell line, labelled via `cell_type`.
e2_bind_df = read_binding_data(binding_E2_file,'CHO-bEFNB2')
e3_bind_df = read_binding_data(binding_E3_file,'CHO-bEFNB3')

# Concat binding and func data, then merge
def concat_dfs(bind1,bind2,entry1,entry2):
    """Stack the binding tables and the entry tables, then outer-join them.

    Args:
        bind1, bind2: Binding DataFrames to concatenate row-wise.
        entry1, entry2: Entry DataFrames to concatenate row-wise.

    Returns:
        Outer merge of the stacked binding and stacked entry tables on
        ``site``, ``wildtype``, ``mutant``, ``cell_type`` and ``mutant_type``;
        rows present on only one side are kept with missing values.
    """
    join_keys = ['site','wildtype','mutant','cell_type','mutant_type']
    stacked_binding = pd.concat([bind1,bind2])
    stacked_entry = pd.concat([entry1,entry2])
    return stacked_binding.merge(stacked_entry,on=join_keys,how='outer')

# Combine both cell lines' binding and entry tables into one long table.
final_merged_df = concat_dfs(e2_bind_df,e3_bind_df,e2_func_df,e3_func_df)

### The merged binding + entry table is the base input for the plots below.
# Show it for a quick visual check.
display(final_merged_df)

def find_domain(df):
    """Annotate each mutation row with the RBP region its site falls in.

    Rows whose ``site`` lies outside every region are dropped. The output is
    ordered by region (Stalk, Neck, Linker, Head), keeping the input row order
    within each region, and is re-indexed from 0.

    Args:
        df: DataFrame with at least the columns ``site``, ``wildtype``,
            ``mutant``, ``binding_mean``, ``effect``, ``cell_type`` and
            ``mutant_type``.

    Returns:
        DataFrame with columns ``site``, ``wildtype``, ``mutant``, ``region``,
        ``binding_mean``, ``effect``, ``cell_type``, ``mutant_type``. When no
        row matches any region, an empty DataFrame with those columns.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # Half-open [start, stop) site ranges for each region.
    region_bounds = {
        "Stalk": (70, 148),
        "Neck": (148, 166),
        "Linker": (166, 178),
        "Head": (178, 603),
    }
    output_columns = [
        "site",
        "wildtype",
        "mutant",
        "region",
        "binding_mean",
        "effect",
        "cell_type",
        "mutant_type",
    ]
    source_columns = [col for col in output_columns if col != "region"]

    # Filter each region in one vectorized step instead of iterating row by row.
    pieces = []
    for region, (start, stop) in region_bounds.items():
        in_region = df.loc[df["site"].isin(range(start, stop)), source_columns]
        if not in_region.empty:
            pieces.append(in_region.assign(region=region))

    if not pieces:
        # Nothing matched: still hand back the expected columns so downstream
        # column lookups keep working.
        return pd.DataFrame(columns=output_columns)
    return pd.concat(pieces, ignore_index=True)[output_columns]
# Add the `region` column to the merged binding/entry table.
binding_entry_concat_df = find_domain(final_merged_df)
display(binding_entry_concat_df)

### Make a dataframe with the averaged values of mutants at each site
# Mean effect and binding per (site, cell_type).
tmp_df = binding_entry_concat_df.groupby(['site','cell_type'])[['effect','binding_mean']].mean().reset_index()
# One row per distinct (site, wildtype, region), used to re-attach those labels.
subset_df = binding_entry_concat_df.drop_duplicates(['site','wildtype','region'])

# The join key is `site` only, so a site appearing in subset_df with more than
# one wildtype/region combination would duplicate its rows here.
mean_df = pd.merge(tmp_df,subset_df[['site','wildtype','region']],on='site',how='left')
display(mean_df.head(5))

# Pivot to one row per mutation, with separate effect/binding columns per cell type.
# aggfunc='first' keeps the first value when a (mutation, cell_type) pair repeats.
df_pivot = binding_entry_concat_df.pivot_table(index=['region', 'site', 'wildtype','mutant','mutant_type'], 
                          columns='cell_type', 
                          values=['effect', 'binding_mean'],
                          aggfunc='first').reset_index()

# Flatten the two-level column index: "<value>_<cell_type>" for value columns,
# and just the first level for index columns, whose second level is empty.
df_pivot.columns = ['_'.join(col).strip() if col[1] else col[0] for col in df_pivot.columns.values]
# Shorten the flattened names to the E2/E3 labels used by the plotting functions.
df_pivot.rename(columns={
    'effect_CHO-bEFNB2': 'effect_E2',
    'effect_CHO-bEFNB3': 'effect_E3',
    'binding_mean_CHO-bEFNB2': 'binding_E2',
    'binding_mean_CHO-bEFNB3': 'binding_E3'
}, inplace=True)

display(df_pivot.sort_values(by='site').head(5))

def correlation_heatmap(df):
    """Build a binned 2-D heatmap of entry effects in the two cell lines.

    Args:
        df: DataFrame with ``effect_E2`` and ``effect_E3`` columns.

    Returns:
        Altair chart of rectangles binned on both axes (maxbins=75) and colored
        by the count of records per bin on a log scale.
    """
    chart = (
        alt.Chart(df,title=alt.Title('Effects of RBP Mutations on Cell Entry',subtitle='Between CHO cells expressing bat EFNB2 or EFNB3'))
        .mark_rect()
        .encode(
            alt.X("effect_E2", title="RBP mutant entry in CHO-bEFNB2",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(maxbins=75),
            alt.Y("effect_E3", title="RBP mutant entry in CHO-bEFNB3",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(maxbins=75),
            alt.Color('count():Q',title='Count').scale(type='log'),
            tooltip=['count()'],
        )
    ).properties(
    height=300,
    width=300,
).configure_legend(
    padding=2,
    orient='top-left', # other options: "left", "right", "top", "bottom", "top-right", "bottom-left", "bottom-right", "none"
    labelFontSize=16,
    titlePadding=2,
    symbolSize=100,
)

    return chart
corr_heatmap = correlation_heatmap(merged_df)
corr_heatmap.display()
# Save only when this chart's own output path was provided.
if output_corr is not None:
    corr_heatmap.save(output_corr)

def plot_entry_binding_interactive(df,name):
    """Build an entry-vs-binding scatter plot linked to two "top 10 sites" bar charts.

    Reads the module-level ``config['contact_sites']`` to flag contact sites.

    Args:
        df: DataFrame with ``site``, ``wildtype``, ``mutant``, ``effect`` and
            ``binding_mean`` columns. It is copied, not modified.
        name: Label inserted into the chart title.

    Returns:
        Altair chart: the scatter plot on top, with the entry and binding bar
        charts side by side underneath.
    """
    #find contact sites
    df_copy = df.copy()
    df_copy.loc[:, 'is_contact'] = df_copy['site'].isin(config['contact_sites'])
    
    # make a brush for interactivity
    brush = alt.selection_interval() 
    
    # scatter plot: brushed points are colored by is_contact, the rest are gray
    chart = alt.Chart(df_copy).mark_point(filled=True,size=50).encode(
            alt.X("effect", title="Cell Entry", axis=alt.Axis(values=[-2,-1,0,1])),
            alt.Y("binding_mean", title="Binding", axis=alt.Axis(values=[-4,-2,0,2])),
            color=alt.condition(brush, 'is_contact', alt.value('lightgray')),  
            tooltip=["site", "wildtype", "mutant", "binding_mean","effect"]  
    ).add_params(
        brush
    ).properties(
        width=400, 
        height=400
    )  
    
    # Bar chart of summed binding_mean per site for the top 10 sites in the brushed selection.
    bars_binding = alt.Chart(df_copy).transform_filter(
        brush  # Apply the selection filter to include only selected data.
    ).transform_aggregate(
        binding_aggr='sum(binding_mean)',  # Aggregate data by summing up binding for selected sites
        groupby=['site', 'is_contact']
    ).transform_window(
        rank='rank(binding_aggr)',  # Rank sites based on the aggregated sum.
        sort=[alt.SortField('binding_aggr', order='descending')]  # Sort by descending order of sum.
    ).transform_filter(
        alt.datum.rank <= 10  # Filter to keep only the top 10 ranked sites.
    ).mark_bar().encode(
        alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),  # Encode site names on x-axis with custom sorting and label angle.
        alt.Y('binding_aggr:Q', title='Binding'),  # Encode aggregated sum on y-axis.
        color=alt.Color('is_contact', title='Receptor Contact Site')  # Color bars based on whether they are contact sites.
    ).properties(
        width=200, 
        height=50
    )  
    
    # Same as the binding bar chart, but aggregates and ranks sites by summed 'effect'.
    bars_effect = alt.Chart(df_copy,title='Top 10').transform_filter(
        brush
    ).transform_aggregate(
        effect_aggr='sum(effect)',
        groupby=['site', 'is_contact']
    ).transform_window(
        rank='rank(effect_aggr)',
        sort=[alt.SortField('effect_aggr', order='descending')]
    ).transform_filter(
        alt.datum.rank <= 10
    ).mark_bar().encode(
        x=alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),
        y=alt.Y('effect_aggr:Q', title='Entry'),
        color=alt.Color('is_contact', title='Receptor Contact Site')
    ).properties(
        width=200, 
        height=50
    )
    
    # Scatter plot on top; the two bar charts sit side by side below it.
    combined_chart = chart & (bars_effect | bars_binding)
    combined_chart = combined_chart.properties(
        title={
            "text": f"Correlation of Cell Entry and Binding for {name}", 
            "subtitle": ["Draw box in scatterplot to show the top 10 sites by summed binding or entry"],
        }
    )
    return combined_chart

# Entry-vs-binding plot for CHO-bEFNB2; saved only when its own output path is set.
entry_binding_corr_plot_E2 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-bEFNB2"'),'bEFNB2')
entry_binding_corr_plot_E2.display()
if entry_binding_corr_plot_E2_output is not None:
    entry_binding_corr_plot_E2.save(entry_binding_corr_plot_E2_output)

# Entry-vs-binding plot for CHO-bEFNB3; saved only when its own output path is set.
entry_binding_corr_plot_E3 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-bEFNB3"'),'bEFNB3')
entry_binding_corr_plot_E3.display()
if entry_binding_corr_plot_E3_output is not None:
    entry_binding_corr_plot_E3.save(entry_binding_corr_plot_E3_output)

display(df_pivot)

def correlation_plot(df):
    """Build side-by-side bEFNB2-vs-bEFNB3 scatter plots for entry and binding.

    A radio-button selection on ``region`` colors the selected region and
    grays out and fades the other points in both panels.

    Args:
        df: DataFrame with ``region``, ``site``, ``wildtype``, ``mutant``,
            ``effect_E2``, ``effect_E3``, ``binding_E2`` and ``binding_E3``
            columns. Numeric values are rounded to 2 decimals for display.

    Returns:
        Horizontally concatenated Altair chart (entry panel | binding panel).
    """
    df = df.round(2)
    
    #setup interactivity
    options = ['Stalk', 'Neck', 'Linker','Head']
    labels = [option + ' ' for option in options]
    
    # Radio buttons: one per region, plus a None option labelled 'All'.
    input_dropdown = alt.binding_radio(
        options=options + [None],
        labels=labels + ['All'],
        name='Region: '
    )
    selection = alt.selection_point(
        fields=['region'],
        bind=input_dropdown,
    )    
    # Selected points keep their region color and full opacity.
    color = alt.condition(
        selection,
        alt.Color('region:N',scale=alt.Scale(domain=options),title='Region'),
        alt.value('lightgray'),
    )
    opacity = alt.condition(
        selection,
        alt.value(1),
        alt.value(0.5)
    )
    
    #make effect chart
    effect_chart = (
        alt.Chart(df,title='Cell entry')
        .mark_point(size=30,opacity=1,filled=True)
        .encode(
            alt.X("effect_E2", title="Entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
            alt.Y("effect_E3", title="Entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
            tooltip=['wildtype',"site", "mutant",'effect_E2','effect_E3'],
            opacity=opacity,
            color=color,
        )
    ).properties(
        height=300,
        width=300
    )
    #make binding chart
    binding_chart = (
        alt.Chart(df,title='Receptor Binding')
        .mark_point(size=30,opacity=1,filled=True)
        .encode(
            alt.X("binding_E2", title="bEFNB2 Binding",axis=alt.Axis(tickCount=4)),
            alt.Y("binding_E3", title="bEFNB3 Binding",axis=alt.Axis(tickCount=4)),
            tooltip=['wildtype',"site", "mutant",'effect_E2','effect_E3','binding_E2','binding_E3'],
            color=color,
            opacity=opacity,
        )
    ).properties(
        height=300,
        width=300
    )
    #combine charts
    combined_chart = effect_chart | binding_chart
    combined_chart=combined_chart.add_params(selection).properties(
        title=alt.Title('Correlations of RBP Mutations Between bEFNB2 and bEFNB3 for Entry and Binding', offset=30,
                        subtitle=['Select radio button to see mutants highlighted or hover over points to see more information'])
    )
    return combined_chart

# Call function above and save when this chart's own output path is set.
corr_entry_binding_large = correlation_plot(df_pivot)
corr_entry_binding_large.display()
if corr_entry_binding_large_output is not None:
    corr_entry_binding_large.save(corr_entry_binding_large_output)

def make_custom_binding_figure(df,name):
    """Build a jittered per-region binding strip plot linked to a top-sites bar chart.

    Args:
        df: DataFrame with ``binding_mean``, ``region``, ``site`` and
            ``mutant`` columns.
        name: Chart title.

    Returns:
        Altair chart: the strip plot stacked above a bar chart of the 10 sites
        with the largest summed ``binding_mean`` inside the brushed selection.
    """
    brush = alt.selection_interval() #define selection brush
    custom_order = ["Stalk", "Neck", "Linker", "Head"]

    chart = alt.Chart(df,title=alt.Title(f'{name}')).mark_point(opacity=0.3,filled=True).encode(
            alt.X(
                "binding_mean",
                title="Binding",
                axis=alt.Axis(tickCount=4),
            ),
            alt.Y(
                "region:O",
                sort=custom_order,
                title="RBP Region",
            ),
            
            # Random vertical offset so overlapping points spread out within each region row.
            yOffset="random:Q",
            tooltip=["region", "binding_mean", "site", "mutant"],
            color=alt.condition(brush, 'region', alt.value('lightgray')),
        ).transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())").properties(height=200,width=300).add_params(brush)
    
    # Sum binding per site within the brush, rank descending and keep the top 10.
    bars = alt.Chart(df).transform_filter(
            brush
        ).transform_aggregate(
            binding_aggr='sum(binding_mean)',
            groupby=['site', 'region']
        ).transform_window(
            rank='rank(binding_aggr)',
            sort=[alt.SortField('binding_aggr', order='descending')]
        ).transform_filter(
            alt.datum.rank <= 10
        ).mark_bar().encode(
            y=alt.Y('binding_aggr:Q',title='Binding'),
            x=alt.X('site:N', sort='-y',title='Site'),
            color=alt.Color('region',title='Region')
        ).properties(height=50,width=300)
    combined_chart = chart & bars 
    return combined_chart

#make individual bEFNB2 plot
efnb2_binding_region = make_custom_binding_figure(binding_entry_concat_df.query('cell_type == "CHO-bEFNB2"'),'bEFNB2')

#make individual bEFNB3 plot
efnb3_binding_region = make_custom_binding_figure(binding_entry_concat_df.query('cell_type == "CHO-bEFNB3"'),'bEFNB3')

# combine the plots
combined_binding = (efnb2_binding_region | efnb3_binding_region).properties(
    title=alt.Title('Effects of RBP Mutations on Receptor Binding', offset=30,
    subtitle='Draw boxes on scatter plots to see the top sites by summed binding')
)

combined_binding.display()
# Save only when this chart's own output path was provided.
if combined_binding_output is not None:
    combined_binding.save(combined_binding_output)

def entry_by_site(df):
    """Build a per-site bar chart of mean entry, one row of bars per cell type.

    NOTE: this name is rebound further down the file by a three-argument
    ``entry_by_site(df, name, effect)``; this one-argument version is only
    callable before that later definition runs.

    Args:
        df: DataFrame with ``site``, ``effect``, ``wildtype``, ``region`` and
            ``cell_type`` columns. Values are rounded to 2 decimals for display.

    Returns:
        Altair chart faceted by ``cell_type``; hovering over a bar outlines it.
    """
    df = df.round(2)
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend

    # Hover selection keyed on site; drives the stroke width below.
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    chart = (
        alt.Chart(df)
        .mark_bar(opacity=1,stroke='black')
        .encode(
            alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y("effect", title="Mean entry"),
            tooltip=['wildtype',"site", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
            # Outline only the hovered bar.
            strokeWidth=alt.condition(
                    variant_selector, alt.value(1), alt.value(0)
                ),
            row=alt.Row('cell_type',title=None,header=alt.Header(labelFontSize=16,labelFontWeight='bold'))
        )
    ).properties(
        width=800,
        height=150
    )
    combined_chart = chart.properties(
        title=alt.Title('Average Cell Entry of Mutants at Each Site',
        subtitle=['Hover mouse over bars to view information about cell entry'])
    ).add_params(variant_selector)
    
    return combined_chart

# Per-site mean entry chart (displayed only; saving is disabled below).
entry_by_site_plot = entry_by_site(mean_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')

def binding_by_site(df):
    """Build a per-site bar chart of mean binding, one row of bars per cell type.

    A zero-valued placeholder row for site 500 (CHO-bEFNB2, Head) is appended
    so that the site still appears on the x-axis.

    Args:
        df: DataFrame with ``site``, ``cell_type``, ``effect``,
            ``binding_mean``, ``wildtype`` and ``region`` columns. Values are
            rounded to 2 decimals for display.

    Returns:
        Altair chart faceted by ``cell_type`` with an independent y-scale per
        row; hovering over a bar outlines it.
    """
    #Site 500 is filtered out so I need to add a dummy value so it shows up on axis:
    data = {
        'site': [500],
        'cell_type': ['CHO-bEFNB2'],
        'effect': [0],
        'binding_mean': [0],
        'wildtype': [None],  # Use None for missing values, or '' for an empty string
        'region': ['Head']
    }
    # Create DataFrame
    dummy_data = pd.DataFrame(data)
    df = pd.concat([df, dummy_data])

    
    df = df.round(2)
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend

    # Hover selection keyed on site; drives the stroke width below.
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    chart = (
        alt.Chart(df)
        .mark_bar(opacity=1,stroke='black')
        .encode(
            alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y("binding_mean", title="Mean binding"),
            # Include the plotted binding value in the tooltip alongside entry.
            tooltip=['wildtype',"site", "binding_mean", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
            strokeWidth=alt.condition(
                    variant_selector, alt.value(1), alt.value(0)
                ),
            row=alt.Row('cell_type',title=None,header=alt.Header(labelFontSize=16,labelFontWeight='bold'))
        )
    ).properties(
        width=800,
        height=150
    )
    combined_chart = chart.properties(
        title=alt.Title('Average Receptor Binding of Mutants at Each Site',
        subtitle=['Hover mouse over bars to view information about receptor binding'])
    ).add_params(variant_selector).resolve_scale(y='independent')
    
    return combined_chart

# Per-site mean binding chart (displayed only; saving is disabled below).
# NOTE: this rebinds `entry_by_site_plot`, although the chart shows binding.
entry_by_site_plot = binding_by_site(mean_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')

def entry_by_site(df,name,effect):
    """Build a per-site mean-entry bar chart with a hover-driven per-mutant heatmap strip.

    NOTE: this rebinds the one-argument ``entry_by_site`` defined earlier in
    the file.

    Args:
        df: DataFrame with ``site``, ``mutant``, ``wildtype``, ``region`` and
            the column named by ``effect``.
        name: Label inserted into the chart title.
        effect: Name of the column holding the entry values to plot.

    Returns:
        Altair chart: the bar chart stacked above a strip showing each
        mutant's value at the hovered site, with an "X" on the wildtype residue.
    """
    # make an empty dataframe with every possible mutation so empty values still get plotted
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]
    sites = range(71, 603)
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acid_order]
    empty_df = pd.DataFrame(data)
    # Left merge keeps every (site, mutant) pair; pairs absent from df get missing values.
    full_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
 
    #setup interactive features
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    #make base chart
    base = alt.Chart(full_df).add_params(variant_selector)
    #add bar chart of cell entry by site
    chart = base.mark_bar(opacity=1,stroke='black').encode(
            alt.X("site:N", title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y(f"mean({effect})", title="Mean entry"),
            tooltip=["site", "wildtype","region"],
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.7)),
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            color=alt.Color('region',title='Region')
    ).properties(
        width=800,
        height=200
    )

    #add heatmap below showing effect of mutants on cell entry
    select_bar = alt.Chart(full_df).mark_bar(stroke='black').encode(
        alt.X('mutant:N',title=None,scale=alt.Scale(domain=amino_acid_order)),
        color=alt.Color(f'{effect}',legend=alt.Legend(orient='right',direction='horizontal',titleAlign='center',titleAnchor='middle'),title='Cell entry',scale=alt.Scale(scheme='redblue',domainMid=0,domain=[-4,2]))
    ).transform_filter(
        variant_selector
    ).properties(
        width=400,
        height=10
    )

    #make heatmap have an x for wildtype residue
    select_bar_wildtype = alt.Chart(full_df).mark_text(color="black", text="X", size=10, align="center", baseline="middle").encode(
        alt.X('wildtype:N',title='Amino acid'),
    ).transform_filter(
        variant_selector
    ).transform_filter(
        # `!= None` is deliberate: this builds an Altair filter expression
        # rather than a Python boolean, so it cannot be written as `is not None`.
        (alt.datum[effect] != None) #filter out empty data
    ).properties(
        width=400,
        height=10
    )
    # combine heatmap plots
    combined_effects_w_wildtype = alt.layer(select_bar_wildtype,select_bar).resolve_scale(x='shared')

    # combine the bar and heatmaps
    combined_chart = alt.vconcat(chart,combined_effects_w_wildtype).resolve_scale(y='independent',x='independent')
    combined_chart = combined_chart.properties(
        title=alt.Title(f'Entry in {name}', offset=30,
        subtitle=['Hover over sites to see information about entry of specific mutations'])
    )
    return combined_chart

# Entry-by-site chart for CHO-bEFNB2; saved only when its own output path is set.
entry_by_site_plot_e2 = entry_by_site(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e2.display()
if entry_by_site_plot_e2_output is not None:
    entry_by_site_plot_e2.save(entry_by_site_plot_e2_output)

# Entry-by-site chart for CHO-bEFNB3; saved only when its own output path is set.
entry_by_site_plot_e3 = entry_by_site(df_pivot,'CHO-bEFNB3','effect_E3')
entry_by_site_plot_e3.display()
if entry_by_site_plot_e3_output is not None:
    entry_by_site_plot_e3.save(entry_by_site_plot_e3_output)

def entry_by_site_test(df,name,effect):
    """Build a per-site mean-entry bar chart with a hover-driven per-mutant bar chart.

    Variant of the three-argument ``entry_by_site``: the lower panel draws one
    bar per mutant at the hovered site instead of a heatmap strip.

    Args:
        df: DataFrame with ``site``, ``mutant``, ``wildtype``, ``region`` and
            the column named by ``effect``.
        name: Label inserted into the chart title.
        effect: Name of the column holding the entry values to plot.

    Returns:
        Altair chart: the site-level bar chart stacked above the per-mutant
        bar chart, with an "X" marking the wildtype residue.
    """
    # make an empty dataframe with every possible mutation so empty values still get plotted
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]
    sites = range(71, 603)
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acid_order]
    empty_df = pd.DataFrame(data)
    # Left merge keeps every (site, mutant) pair; pairs absent from df get missing values.
    full_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
 
    #setup interactive features
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    #make base chart
    base = alt.Chart(full_df).add_params(variant_selector)
    #add bar chart of cell entry by site
    chart = base.mark_bar(opacity=1,stroke='black').encode(
            alt.X("site:N", title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y(f"mean({effect})", title="Mean cell entry"),
            tooltip=["site", "wildtype","region"],
            # Both branches are 1, so opacity does not change on hover.
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(1)),
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            color=alt.Color('region',title='Region')
    ).properties(
        width=800,
        height=150
    )

    # Per-mutant bars for the hovered site.
    # NOTE(review): the mark color 'darkgray' is presumably overridden by the
    # Color encoding below - confirm.
    select_bar = alt.Chart(full_df).mark_bar(stroke='black', color='darkgray').encode(
        alt.X('mutant:N', title=None, scale=alt.Scale(domain=amino_acid_order)),  # Removed sort='-y' here
        alt.Y(f'{effect}', scale=alt.Scale(domain=[-4, 2]), title='Cell entry'),
        alt.Color(f'{effect}',title='Effect', legend=None,scale=alt.Scale(scheme='redblue',domainMid=0,domain=[-4,2])),
    ).transform_filter(
        variant_selector
    ).transform_window(
        # The computed `rank` field is not referenced by any encoding in this function.
        rank='rank()',
        sort=[alt.SortField(f'{effect}', order='descending')]  # Sort based on the effect values in descending order
    ).properties(
        width=400,
        height=150
    )

    # Mark the wildtype residue with an "X", shifted up by dy=-20.
    select_bar_wildtype = alt.Chart(full_df).mark_text(color="black", text="X", size=14, align="center", baseline="middle",dy=-20).encode(
        alt.X('wildtype:N',title='Amino acid'),
    ).transform_filter(
        variant_selector
    ).transform_filter(
        # `!= None` is deliberate: this builds an Altair filter expression
        # rather than a Python boolean, so it cannot be written as `is not None`.
        (alt.datum[effect] != None) #filter out empty data
    ).properties(
        width=400,
        height=150
    )
    
    # layer the wildtype marker over the per-mutant bars
    combined_effects_w_wildtype = alt.layer(select_bar_wildtype,select_bar).resolve_scale(x='shared')

    # stack the site-level chart above the per-mutant chart
    combined_chart = alt.vconcat(chart,combined_effects_w_wildtype).resolve_scale(y='independent',x='independent')
    combined_chart = combined_chart.properties(
        title=alt.Title(f'Cell Entry in {name}', offset=30,
        subtitle=['Effects of RBP mutations on cell entry','Hover over sites to see information about specific mutations'])
    )
    return combined_chart

# Call functions to plot
entry_by_site_plot_e2_bar = entry_by_site_test(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e3_bar = entry_by_site_test(df_pivot,'CHO-bEFNB3','effect_E3')

# Visualize plots
entry_by_site_plot_e2_bar.display()
entry_by_site_plot_e3_bar.display()

# Save each plot only when its own output path was provided (e.g. by papermill)
if entry_by_site_plot_e2_bar_plot is not None:
    entry_by_site_plot_e2_bar.save(entry_by_site_plot_e2_bar_plot)
if entry_by_site_plot_e3_bar_plot is not None:
    entry_by_site_plot_e3_bar.save(entry_by_site_plot_e3_bar_plot)

def make_effect_by_site_with_hover_tooltip(df):
    """Build a line chart of mean entry per site with a hover rule and value labels.

    Args:
        df: DataFrame with ``cell_type``, ``site`` and ``effect`` columns.

    Returns:
        Layered Altair chart (800x200) with one smoothed line per cell type.
    """
    # Mean effect per (cell_type, site), rounded to 2 decimals for the text labels.
    tmp_df = df.groupby(['cell_type','site'])['effect'].mean().reset_index().round(2)
    
    # Create a selection that chooses the nearest point & selects based on x-value
    nearest = alt.selection_point(nearest=True, on='mouseover',
                            fields=['site'], empty=False)
    
    # The basic line
    line = alt.Chart(tmp_df).mark_line(interpolate='basis',size=1).encode(
        alt.X('site:Q', title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600])),
        alt.Y('effect:Q',title='Mean entry'),
        color=alt.Color('cell_type:N',title='Cell type')
    )
    
    # Transparent selectors across the chart. This is what tells us
    # the x-value of the cursor
    selectors = alt.Chart(tmp_df).mark_point().encode(
        alt.X('site:Q'),
        opacity=alt.value(0),
    ).add_params(
        nearest
    )
    # Draw points on the line, and highlight based on selection
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )
    
    # Draw text labels near the points, and highlight based on selection
    text = line.mark_text(align='left', dx=5, dy=-5,fontSize=15).encode(
        text=alt.condition(nearest, 'effect:Q', alt.value(' ')),
        #color=alt.value('black')
    )#.transform_filter(alt.datum.cell_type == 'CHO-EFNB2')
    
    # Draw a rule at the location of the selection
    rules = alt.Chart(tmp_df).mark_rule(color='gray').encode(
        x='site:Q',
    ).transform_filter(
        nearest
    )
    # Put the five layers into a chart and bind the data
    combined_chart = alt.layer(
        line, selectors, points, rules, text
    ).properties(
        width=800, height=200
    )
    return combined_chart

# Line-chart view of mean entry per site for both cell types (displayed only).
alt_plot = make_effect_by_site_with_hover_tooltip(binding_entry_concat_df)
alt_plot.display()

def plot_affinity_individual_mutants(df):
    """Build linked bEFNB2-vs-bEFNB3 binding scatter plots with per-site amino-acid letters.

    The left panel shows every mutation as a point; hovering highlights the
    mutations at the hovered site. The right panel draws only that site's
    mutations as amino-acid letters colored by mutant type.

    NOTE: this name is defined a second time further down the file.

    Args:
        df: DataFrame with ``site``, ``wildtype``, ``mutant``, ``mutant_type``,
            ``binding_E2``, ``binding_E3``, ``effect_E2`` and ``effect_E3``
            columns. Values are rounded to 2 decimals for display.

    Returns:
        Horizontally concatenated Altair chart (all-sites scatter | letters).
    """
    df = df.round(2).copy()

    #setup interactive features
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=['site'], value=1
    )

    # make left side
    all_sites_scatter = (alt.Chart(df)
        .mark_point(filled=True,stroke='black',size=50)
        .encode(
            alt.X("binding_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4.5,2])),
            alt.Y("binding_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-2.5,2])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
            # Points at the hovered site are opaque, colored and outlined.
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
            color=alt.condition(variant_selector,alt.value('#af7aa1'),alt.value('gray')),
            strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0))
        )
        .add_params(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )

    # make amino acid letters
    chart = (alt.Chart(df)
        .mark_text(size=20)
        .encode(
            alt.X("binding_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4.5,2])),
            alt.Y("binding_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-2.5,2])),
            alt.Text('mutant'),
            alt.Color('mutant_type',title='Mutant type',scale=alt.Scale(
                    domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
                    range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'], 
            
        )
        .add_params(variant_selector)
        .transform_filter(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )
    # Vertical line at x=0
    vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
    # Horizontal line at y=0
    hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
    # Combine the amino acid letter plot with the vertical and horizontal lines
    final_chart = vline + hline + chart
    final_scatter = vline + hline + all_sites_scatter 
    combined = alt.hconcat(final_scatter,final_chart,spacing=20).properties(title=alt.Title('Effects of RBP mutations on binding to bEFNB2 and bEFNB3',subtitle='Hover over points to see individual amino acids'))
    return combined

# Binding letter plot; saved only when its own output path is set.
test_plot = plot_affinity_individual_mutants(df_pivot)
test_plot.display()
if binding_letter_plot is not None:
    test_plot.save(binding_letter_plot)

def plot_affinity_individual_mutants(df):
    """Build a hover-linked pair of cell entry plots (CHO-bEFNB2 vs CHO-bEFNB3).

    NOTE(review): despite the name, this function plots the entry columns
    (``effect_E2`` / ``effect_E3``). The same name is already called earlier
    in the notebook, so this definition rebinds it -- consider renaming.

    The left panel draws every row of ``df`` as a point. The right panel draws
    the ``mutant`` letter of the rows whose ``site`` matches the point nearest
    to the mouse, colored by ``mutant_type``. Dashed gray rules mark zero on
    both axes of both panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns referenced below: ``site``, ``wildtype``,
        ``mutant``, ``mutant_type``, ``binding_E2``, ``binding_E3``,
        ``effect_E2`` and ``effect_E3``. It is not modified; a copy rounded
        to two decimals is plotted.

    Returns
    -------
    altair.HConcatChart
        The two panels placed side by side under a shared title.
    """
    df = df.round(2).copy()

    # Hovering selects the site of the nearest point; empty=False means that
    # nothing is highlighted (and no letters are shown) without a selection.
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=['site'], value=1
    )

    # make left side: all rows as points, the hovered site highlighted
    all_sites_scatter = (alt.Chart(df)
        .mark_point(filled=True,stroke='black',size=50)
        .encode(
            alt.X("effect_E2", title=("Entry in CHO-bEFNB2"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            alt.Y("effect_E3", title=("Entry in CHO-bEFNB3"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
            color=alt.condition(variant_selector,alt.value('#af7aa1'),alt.value('gray')),
            strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0))
        )
        .add_params(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )

    # make amino acid letters: only the rows of the hovered site are kept
    chart = (alt.Chart(df)
        .mark_text(size=20)
        .encode(
            alt.X("effect_E2", title=("Entry in CHO-bEFNB2"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            alt.Y("effect_E3", title=("Entry in CHO-bEFNB3"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            alt.Text('mutant'),
            alt.Color('mutant_type',title='Mutant type',scale=alt.Scale(
                    domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
                    range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
        )
        .add_params(variant_selector)
        .transform_filter(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )
    # Vertical line at x=0
    vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
    # Horizontal line at y=0
    hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
    # Layer the zero lines underneath each panel, then place the panels side by side
    final_chart = vline + hline + chart
    final_scatter = vline + hline + all_sites_scatter
    combined = alt.hconcat(final_scatter,final_chart,spacing=20).properties(title=alt.Title('Effects of RBP mutations on cell entry',subtitle='Hover over points to see individual amino acids'))
    return combined

# Build the hover-linked entry plot from the pivoted dataframe and show it inline.
test_plot = plot_affinity_individual_mutants(df_pivot)
test_plot.display()
# Only write the figure when its own output path was supplied; the papermill
# parameter defaults to None, and save(None) would fail.
if entry_letter_plot is not None:
    test_plot.save(entry_letter_plot)

def plot_entry_slider(df):
    """Letter plot of entry effects for a single site chosen with a slider.

    Rows of ``df`` (rounded to two decimals) that pass the slider's ``site``
    filter are drawn as their ``mutant`` letter at (``effect_E2``,
    ``effect_E3``) and colored by ``mutant_type``. Dashed gray rules mark zero
    on both axes. Returns the layered chart (rules underneath, letters on top).
    """
    rounded = df.round(2).copy()

    # interactive parameters: a hover selection on site and a slider-bound site selection
    hover_site = alt.selection_point(
        on="mouseover",
        empty=False,
        nearest=True,
        fields=['site'],
        value=1,
    )
    site_slider = alt.binding_range(min=71, max=602, step=1, name='Site ')
    slider_site = alt.selection_point(
        name="SelectorName",
        fields=['site'],
        bind=site_slider,
        value=[{'site': 71}],
    )

    # encoding channels for the amino acid letters
    x_channel = alt.X(
        "effect_E2",
        title="Entry in CHO-bEFNB2",
        axis=alt.Axis(tickCount=4),
        scale=alt.Scale(domain=[-4, 1]),
    )
    y_channel = alt.Y(
        "effect_E3",
        title="Entry in CHO-bEFNB3",
        axis=alt.Axis(tickCount=4),
        scale=alt.Scale(domain=[-4, 1]),
    )
    letter_channel = alt.Text('mutant')
    type_palette = alt.Scale(
        domain=['Aromatic', 'Hydrophilic', 'Hydrophobic', 'Negative', 'Positive', 'Special'],
        range=["#4e79a7", "#f28e2c", "#e15759", "#76b7b2", "#59a14f", "#edc949"],
    )
    color_channel = alt.Color('mutant_type', title='Mutant type', scale=type_palette)
    hover_fields = ['site', 'wildtype', 'effect_E2', 'effect_E3']
    heading = alt.Title(
        'Effects of RBP mutations on cell entry',
        offset=30,
        subtitle='Use slider to see individual mutations at each site',
    )

    # letters for the site currently picked on the slider
    letters = alt.Chart(rounded).mark_text(size=20)
    letters = letters.encode(
        x_channel,
        y_channel,
        letter_channel,
        color_channel,
        tooltip=hover_fields,
    )
    letters = letters.add_params(hover_site, slider_site)
    letters = letters.transform_filter(slider_site)
    letters = letters.properties(height=300, width=300, title=heading)

    # dashed reference rules through zero on each axis
    rule_style = dict(color='gray', opacity=0.5, strokeDash=[2, 4])
    zero_x = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(**rule_style).encode(x='x:Q')
    zero_y = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(**rule_style).encode(y='y:Q')

    # rules first so the letters are drawn on top of them
    return zero_x + zero_y + letters

# Build the slider-driven entry letter plot and show it inline.
entry_slider_plot = plot_entry_slider(df_pivot)
entry_slider_plot.display()
# Only write the figure when its own output path was supplied; the papermill
# parameter defaults to None, and save(None) would fail.
if entry_letter_plot_slider is not None:
    entry_slider_plot.save(entry_letter_plot_slider)

def plot_affinity_slider(df):
    """Letter plot of binding effects for a single site chosen with a slider.

    Rows of ``df`` (rounded to two decimals) that pass the slider's ``site``
    filter are drawn as their ``mutant`` letter at (``binding_E2``,
    ``binding_E3``) and colored by ``mutant_type``. Dashed gray rules mark
    zero on both axes. Returns the layered chart (rules underneath, letters
    on top).
    """
    rounded = df.round(2).copy()

    # interactive parameters: a hover selection on site and a slider-bound site selection
    hover_site = alt.selection_point(
        on="mouseover",
        empty=False,
        nearest=True,
        fields=['site'],
        value=1,
    )
    site_slider = alt.binding_range(min=71, max=602, step=1, name='Site ')
    slider_site = alt.selection_point(
        name="SelectorName",
        fields=['site'],
        bind=site_slider,
        value=[{'site': 71}],
    )

    # encoding channels for the amino acid letters
    x_channel = alt.X(
        "binding_E2",
        title="bEFNB2 binding",
        axis=alt.Axis(tickCount=4),
        scale=alt.Scale(domain=[-5, 2]),
    )
    y_channel = alt.Y(
        "binding_E3",
        title="bEFNB3 binding",
        axis=alt.Axis(tickCount=4),
        scale=alt.Scale(domain=[-2.25, 2]),
    )
    letter_channel = alt.Text('mutant')
    type_palette = alt.Scale(
        domain=['Aromatic', 'Hydrophilic', 'Hydrophobic', 'Negative', 'Positive', 'Special'],
        range=["#4e79a7", "#f28e2c", "#e15759", "#76b7b2", "#59a14f", "#edc949"],
    )
    color_channel = alt.Color('mutant_type', title='Mutant type', scale=type_palette)
    hover_fields = ['site', 'wildtype', 'binding_E2', 'binding_E3']
    heading = alt.Title(
        'Effects of RBP mutations on receptor binding',
        offset=30,
        subtitle='Use slider to see individual mutations at each site',
    )

    # letters for the site currently picked on the slider
    letters = alt.Chart(rounded).mark_text(size=20)
    letters = letters.encode(
        x_channel,
        y_channel,
        letter_channel,
        color_channel,
        tooltip=hover_fields,
    )
    letters = letters.add_params(hover_site, slider_site)
    letters = letters.transform_filter(slider_site)
    letters = letters.properties(height=300, width=300, title=heading)

    # dashed reference rules through zero on each axis
    rule_style = dict(color='gray', opacity=0.5, strokeDash=[2, 4])
    zero_x = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(**rule_style).encode(x='x:Q')
    zero_y = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(**rule_style).encode(y='y:Q')

    # rules first so the letters are drawn on top of them
    return zero_x + zero_y + letters

# Build the slider-driven binding letter plot and show it inline.
binding_slider_plot = plot_affinity_slider(df_pivot)
binding_slider_plot.display()
# Only write the figure when its own output path was supplied; the papermill
# parameter defaults to None, and save(None) would fail.
if binding_letter_plot_slider is not None:
    binding_slider_plot.save(binding_letter_plot_slider)

	site	wildtype	mutant	effect_E2	effect_std_E2	times_seen_E2	n_selections_E2	cell_type_E2	wildtype_site_E2	wt_type_E2	mutant_type_E2	effect_E3	effect_std_E3	times_seen_E3	n_selections_E3	cell_type_E3	wildtype_site_E3	wt_type_E3	mutant_type_E3
0	71	Q	C	-1.750	0.1777	4.625	8.0	CHO-bEFNB2	Q71	Hydrophilic	Special	-0.7227	0.7828	3.000	7.0	CHO-bEFNB3	Q71	Hydrophilic	Special
1	71	Q	D	-1.164	0.8890	4.500	8.0	CHO-bEFNB2	Q71	Hydrophilic	Negative	-0.3884	0.6369	3.429	7.0	CHO-bEFNB3	Q71	Hydrophilic	Negative
2	71	Q	E	-1.255	0.3123	5.375	8.0	CHO-bEFNB2	Q71	Hydrophilic	Negative	-0.2482	0.9791	4.571	7.0	CHO-bEFNB3	Q71	Hydrophilic	Negative
3	71	Q	F	-1.058	0.6637	4.625	8.0	CHO-bEFNB2	Q71	Hydrophilic	Aromatic	-0.4973	0.3080	3.286	7.0	CHO-bEFNB3	Q71	Hydrophilic	Aromatic
4	71	Q	G	-1.425	0.5878	7.875	8.0	CHO-bEFNB2	Q71	Hydrophilic	Special	-1.3310	0.8316	4.714	7.0	CHO-bEFNB3	Q71	Hydrophilic	Special

	site	wildtype	mutant	binding_mean	mutant_type	cell_type	effect
0	71	Q	D	-0.78170	Negative	CHO-bEFNB2	-1.16400
1	71	Q	E	0.16590	Negative	CHO-bEFNB2	-1.25500
2	71	Q	F	-0.34290	Aromatic	CHO-bEFNB2	-1.05800
3	71	Q	G	0.46570	Special	CHO-bEFNB2	-1.42500
4	71	Q	H	0.02003	Positive	CHO-bEFNB2	-0.37640
...	...	...	...	...	...	...	...
19494	601	C	F	NaN	Aromatic	CHO-bEFNB3	-1.66700
19495	601	C	G	NaN	Special	CHO-bEFNB3	-2.04700
19496	601	C	I	NaN	Hydrophobic	CHO-bEFNB3	-0.75770
19497	601	C	P	NaN	Special	CHO-bEFNB3	-1.52300
19498	601	C	V	NaN	Hydrophobic	CHO-bEFNB3	0.01403

	region	site	wildtype	mutant	mutant_type	binding_E2	binding_E3	effect_E2	effect_E3
8545	Stalk	71	Q	P	Special	NaN	NaN	-3.385000	-3.36700
8546	Stalk	71	Q	R	Positive	0.14460	-0.07310	-0.650800	0.06744
8542	Stalk	71	Q	L	Hydrophobic	0.05129	-0.03759	-1.072000	-0.13050
8541	Stalk	71	Q	K	Positive	0.08932	-0.03605	0.005061	-0.25570
8539	Stalk	71	Q	H	Positive	0.02003	-0.06030	-0.376400	-0.05103

	region	site	wildtype	mutant	mutant_type	binding_E2	binding_E3	effect_E2	effect_E3
0	Head	178	V	A	Hydrophobic	0.7066	0.008861	-0.2181	0.01306
1	Head	178	V	C	Special	0.1814	0.451400	0.1203	0.47640
2	Head	178	V	D	Negative	NaN	-0.041930	-1.9200	-1.03800
3	Head	178	V	E	Negative	NaN	0.142800	-1.7900	-0.41900
4	Head	178	V	F	Aromatic	0.5869	0.039550	-0.7901	-0.34260
...	...	...	...	...	...	...	...	...	...
9934	Stalk	147	K	S	Hydrophilic	0.1344	-0.060950	0.1857	0.13650
9935	Stalk	147	K	T	Hydrophilic	1.0700	-0.052750	-0.3402	-0.79560
9936	Stalk	147	K	V	Hydrophobic	NaN	0.086850	-1.9730	-1.02500
9937	Stalk	147	K	W	Aromatic	NaN	NaN	-2.9010	-2.27500
9938	Stalk	147	K	Y	Aromatic	NaN	NaN	-2.9410	-1.39500

interactive_figures.ipynb¶

Make large interactive figures for web viewing using altair with the Nipah RBP DMS data¶

Import modules¶

Set working directory¶

Setup input file paths for running notebook interactively¶

Read config files¶

Import filtered data¶

Merge data and make dataframes for plotting¶

Now assign RBP region to the dataframe¶

Make a site-averaged dataframe of cell entry and binding¶

Make a pivot table for plotting certain data¶

Make plots¶

Make heatmap of correlations between entry in CHO-bEFNB2 and CHO-bEFNB3¶

Make interactive plot linking individual binding and entry effects with top 10 summed binding and entry¶

Now do the same as above for EFNB3¶

Make correlation plots for entry and binding for both efnb2 and efnb3 colored by protein region¶

Make figures showing only binding¶

Make interactive plots of average effects of mutants by site¶

Make interactive chart similar to above but also show individual mutations in heatmaps¶

TESTING. Stuff below is a work in progress¶

	site	cell_type	effect	binding_mean	wildtype	region
0	71	CHO-bEFNB2	-1.176113	0.043824	Q	Stalk
1	71	CHO-bEFNB3	-0.616870	-0.111348	Q	Stalk
2	72	CHO-bEFNB2	-1.231829	0.079725	N	Stalk
3	72	CHO-bEFNB3	-0.759448	-0.103486	N	Stalk
4	73	CHO-bEFNB2	-0.742614	0.130661	Y	Stalk