interactive_figures.ipynb¶

Make large interactive figures for web viewing using altair with the Nipah RBP DMS data¶

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# Every name defaults to None so the notebook can also be run by hand:
# a later cell fills in input paths when `nipah_config` is still None, and the
# figure-saving code is guarded by `is not None` checks on the output names.
#input configs
altair_config = None
nipah_config = None

#E2 specific files
func_scores_E2_file = None
binding_E2_file = None
#E3 specific files
func_scores_E3_file = None
binding_E3_file = None

#merged_files
merged_df_file = None
concat_df_file = None

#output plots
# (destinations handed to the charts' `.save(...)` calls further down)
output_corr = None
entry_binding_corr_plot_E2_output = None
entry_binding_corr_plot_E3_output = None
corr_entry_binding_large_output = None
combined_binding_output = None
entry_by_site_plot_e2_output = None
entry_by_site_plot_e3_output = None
entry_by_site_plot_e2_bar_plot = None
binding_letter_plot = None
entry_letter_plot = None
entry_letter_plot_slider = None
binding_letter_plot_slider = None
entry_by_site_plot_e3_bar_plot = None
In [2]:
# Parameters
# Concrete values that override the None defaults declared in the cell above.
# NOTE(review): presumably written by papermill when the pipeline executes the
# notebook - confirm before editing these by hand.
altair_config = "data/custom_analyses_data/interactive_theme.py"
nipah_config = "nipah_config.yaml"
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
output_corr = "results/images/corr_heatmap.html"
entry_binding_corr_plot_E2_output = "results/images/entry_binding_corr_plot_E2.html"
entry_binding_corr_plot_E3_output = "results/images/entry_binding_corr_plot_E3.html"
corr_entry_binding_large_output = "results/images/corr_entry_binding_large.html"
combined_binding_output = "results/images/combined_binding.html"
entry_by_site_plot_e2_output = "results/images/entry_by_site_plot_e2.html"
entry_by_site_plot_e3_output = "results/images/entry_by_site_plot_e3.html"
entry_by_site_plot_e2_bar_plot = "results/images/entry_by_site_plot_e2_bar_plot.html"
binding_letter_plot = "results/images/binding_letter_plot.html"
entry_letter_plot = "results/images/entry_letter_plot.html"
entry_letter_plot_slider = "results/images/entry_letter_plot_slider.html"
binding_letter_plot_slider = "results/images/binding_letter_plot_slider.html"
entry_by_site_plot_e3_bar_plot = "results/images/entry_by_site_plot_e3_bar_plot.html"

Import modules¶

In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
# NOTE(review): this is an absolute, machine-specific path; consider turning it
# into a papermill parameter so the notebook can run elsewhere.
project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
# os.getcwd() never ends with a path separator, so both sides are normalised
# before comparing; a raw string comparison against the trailing-slash path
# could never succeed.
if os.path.normpath(os.getcwd()) == os.path.normpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

# import the Altair theme from data/custom_analyses_data/interactive_theme.py and enable it
theme_dir = 'data/custom_analyses_data/'
if theme_dir not in sys.path:
    # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(theme_dir)
import interactive_theme
alt.themes.register('interactive_theme', interactive_theme.interactive_theme)
alt.themes.enable('interactive_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('interactive_theme')

Set working directory¶

Setup input file paths for running notebook interactively¶

In [4]:
# Fallback for running the notebook by hand: when no parameters were supplied
# (`nipah_config` still has its None default) fill in the input paths here.
# The output-path names are left untouched, so they stay None in this mode.
if nipah_config is None:
    #input files
    #altair_config = 'data/custom_analyses_data/interactive_theme.py'
    nipah_config = 'nipah_config.yaml'
    
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
    
    # NOTE(review): `antibody_file` is only assigned in this branch and has no
    # counterpart in the parameters cell - confirm whether it is still needed.
    antibody_file = 'results/filtered_data/escape/mab_filter_concat.csv'
    merged_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_merged.csv'
    concat_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_concat.csv'

Read config files¶

In [5]:
# Parse the project YAML configuration into `config` (used by the plotting code below)
with open(nipah_config) as config_handle:
    config = yaml.safe_load(config_handle)

Import filtered data¶

In [6]:
# Load both versions of the filtered entry scores first ...
merged_df = pd.read_csv(merged_df_file)  # merged entry scores
concat_df = pd.read_csv(concat_df_file)  # concat entry scores

# ... then preview the first rows of each (head() defaults to 5 rows)
display(merged_df.head())
display(concat_df.head())
site wildtype mutant effect_E2 effect_std_E2 times_seen_E2 n_selections_E2 cell_type_E2 wildtype_site_E2 wt_type_E2 mutant_type_E2 effect_E3 effect_std_E3 times_seen_E3 n_selections_E3 cell_type_E3 wildtype_site_E3 wt_type_E3 mutant_type_E3
0 71 Q C -1.750 0.1777 4.625 8.0 CHO-bEFNB2 Q71 Hydrophilic Special -0.7227 0.7828 3.000 7.0 CHO-bEFNB3 Q71 Hydrophilic Special
1 71 Q D -1.164 0.8890 4.500 8.0 CHO-bEFNB2 Q71 Hydrophilic Negative -0.3884 0.6369 3.429 7.0 CHO-bEFNB3 Q71 Hydrophilic Negative
2 71 Q E -1.255 0.3123 5.375 8.0 CHO-bEFNB2 Q71 Hydrophilic Negative -0.2482 0.9791 4.571 7.0 CHO-bEFNB3 Q71 Hydrophilic Negative
3 71 Q F -1.058 0.6637 4.625 8.0 CHO-bEFNB2 Q71 Hydrophilic Aromatic -0.4973 0.3080 3.286 7.0 CHO-bEFNB3 Q71 Hydrophilic Aromatic
4 71 Q G -1.425 0.5878 7.875 8.0 CHO-bEFNB2 Q71 Hydrophilic Special -1.3310 0.8316 4.714 7.0 CHO-bEFNB3 Q71 Hydrophilic Special
site wildtype mutant effect effect_std times_seen n_selections cell_type wildtype_site wt_type mutant_type
0 71 Q C -1.750 0.1777 4.625 8 CHO-bEFNB2 Q71 Hydrophilic Special
1 71 Q D -1.164 0.8890 4.500 8 CHO-bEFNB2 Q71 Hydrophilic Negative
2 71 Q E -1.255 0.3123 5.375 8 CHO-bEFNB2 Q71 Hydrophilic Negative
3 71 Q F -1.058 0.6637 4.625 8 CHO-bEFNB2 Q71 Hydrophilic Aromatic
4 71 Q G -1.425 0.5878 7.875 8 CHO-bEFNB2 Q71 Hydrophilic Special

Merge data and make dataframes for plotting¶

In [7]:
# Read filtered cell entry data
def read_func_data(file,name):
    """Load filtered cell-entry scores and tag every row with a cell line.

    Parameters
    ----------
    file : anything ``pd.read_csv`` accepts (path or file-like object)
    name : value written into the new ``cell_type`` column

    Returns
    -------
    pandas.DataFrame with the columns ``site``, ``wildtype``, ``mutant``,
    ``effect``, ``mutant_type`` and ``cell_type`` (in that order).
    """
    wanted_columns = ['site', 'wildtype', 'mutant', 'effect', 'mutant_type']
    entry_scores = pd.read_csv(file).loc[:, wanted_columns]
    return entry_scores.assign(cell_type=name)

# Load the entry scores of each cell line, labelled with its cell type
e2_func_df = read_func_data(file=func_scores_E2_file, name='CHO-bEFNB2')
e3_func_df = read_func_data(file=func_scores_E3_file, name='CHO-bEFNB3')

# Read filtered binding data
def read_binding_data(file,name):
    """Load filtered receptor-binding scores and tag every row with a cell line.

    Parameters
    ----------
    file : anything ``pd.read_csv`` accepts (path or file-like object)
    name : value written into the new ``cell_type`` column

    Returns
    -------
    pandas.DataFrame with the columns ``site``, ``wildtype``, ``mutant``,
    ``binding_mean``, ``mutant_type`` and ``cell_type`` (in that order).
    """
    wanted_columns = ['site', 'wildtype', 'mutant', 'binding_mean', 'mutant_type']
    binding_scores = pd.read_csv(file).loc[:, wanted_columns]
    return binding_scores.assign(cell_type=name)

# Load the binding scores of each cell line, labelled with its cell type
e2_bind_df = read_binding_data(file=binding_E2_file, name='CHO-bEFNB2')
e3_bind_df = read_binding_data(file=binding_E3_file, name='CHO-bEFNB3')

# Concat binding and func data, then merge
def concat_dfs(bind1,bind2,entry1,entry2):
    """Stack the two binding frames and the two entry frames, then join them.

    The join is an outer merge on site, wildtype, mutant, cell_type and
    mutant_type, so a mutation measured in only one of the two assays is kept
    with NaN in the column of the other assay.
    """
    join_keys = ['site', 'wildtype', 'mutant', 'cell_type', 'mutant_type']
    stacked_binding = pd.concat([bind1, bind2])
    stacked_entry = pd.concat([entry1, entry2])
    return stacked_binding.merge(stacked_entry, on=join_keys, how='outer')

final_merged_df = concat_dfs(
    bind1=e2_bind_df,
    bind2=e3_bind_df,
    entry1=e2_func_df,
    entry2=e3_func_df,
)

# Review the combined table: binding and entry scores side by side for every
# mutation / cell type combination
display(final_merged_df)
site wildtype mutant binding_mean mutant_type cell_type effect
0 71 Q D -0.78170 Negative CHO-bEFNB2 -1.16400
1 71 Q E 0.16590 Negative CHO-bEFNB2 -1.25500
2 71 Q F -0.34290 Aromatic CHO-bEFNB2 -1.05800
3 71 Q G 0.46570 Special CHO-bEFNB2 -1.42500
4 71 Q H 0.02003 Positive CHO-bEFNB2 -0.37640
... ... ... ... ... ... ... ...
19494 601 C F NaN Aromatic CHO-bEFNB3 -1.66700
19495 601 C G NaN Special CHO-bEFNB3 -2.04700
19496 601 C I NaN Hydrophobic CHO-bEFNB3 -0.75770
19497 601 C P NaN Special CHO-bEFNB3 -1.52300
19498 601 C V NaN Hydrophobic CHO-bEFNB3 0.01403

19499 rows × 7 columns

Now assign RBP region to the dataframe¶

In [8]:
def find_domain(df):
    """Label every mutation with the RBP region its site belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``site``, ``wildtype``, ``mutant``,
        ``binding_mean``, ``effect``, ``cell_type`` and ``mutant_type``.

    Returns
    -------
    pandas.DataFrame
        Columns ``site, wildtype, mutant, region, binding_mean, effect,
        cell_type, mutant_type`` with a fresh 0..n-1 index. Rows are grouped
        by region in the order Stalk, Neck, Linker, Head and keep their input
        order within a region. Rows whose site lies outside 70-602 are
        dropped. An empty input yields an empty frame with these columns.
    """
    # half-open [start, stop) site intervals of each region
    region_bounds = {
        "Stalk": (70, 148),
        "Neck": (148, 166),
        "Linker": (166, 178),
        "Head": (178, 603),
    }
    output_columns = [
        "site", "wildtype", "mutant", "region",
        "binding_mean", "effect", "cell_type", "mutant_type",
    ]
    source_columns = [col for col in output_columns if col != "region"]

    # One lookup for all rows instead of copying row by row with iterrows()
    site_to_region = {
        site: region
        for region, (start, stop) in region_bounds.items()
        for site in range(start, stop)
    }
    region_of_row = df["site"].map(site_to_region)
    keep = region_of_row.notna().to_numpy()

    # plain arrays are used so the assignment is positional and does not
    # depend on the (possibly non-unique) index of the input
    labelled = df.loc[keep, source_columns].assign(
        region=region_of_row.to_numpy()[keep]
    )

    # stable sort by region rank keeps the input order inside each region
    region_rank = {region: rank for rank, region in enumerate(region_bounds)}
    ranks = labelled["region"].map(region_rank).to_numpy(dtype=int)
    row_order = np.argsort(ranks, kind="stable")

    return labelled.iloc[row_order][output_columns].reset_index(drop=True)
# Annotate the merged binding/entry table with RBP regions and inspect it
binding_entry_concat_df = find_domain(df=final_merged_df)
display(binding_entry_concat_df)
site wildtype mutant region binding_mean effect cell_type mutant_type
0 71 Q D Stalk -0.78170 -1.16400 CHO-bEFNB2 Negative
1 71 Q E Stalk 0.16590 -1.25500 CHO-bEFNB2 Negative
2 71 Q F Stalk -0.34290 -1.05800 CHO-bEFNB2 Aromatic
3 71 Q G Stalk 0.46570 -1.42500 CHO-bEFNB2 Special
4 71 Q H Stalk 0.02003 -0.37640 CHO-bEFNB2 Positive
... ... ... ... ... ... ... ... ...
19494 601 C F Head NaN -1.66700 CHO-bEFNB3 Aromatic
19495 601 C G Head NaN -2.04700 CHO-bEFNB3 Special
19496 601 C I Head NaN -0.75770 CHO-bEFNB3 Hydrophobic
19497 601 C P Head NaN -1.52300 CHO-bEFNB3 Special
19498 601 C V Head NaN 0.01403 CHO-bEFNB3 Hydrophobic

19499 rows × 8 columns

Make a site-averaged dataframe of cell entry and binding¶

In [9]:
### Average the entry and binding values of all mutants at each site, per cell type
tmp_df = (
    binding_entry_concat_df
    .groupby(['site', 'cell_type'], as_index=False)[['effect', 'binding_mean']]
    .mean()
)
# one (site, wildtype, region) record per site, used to annotate the averages
subset_df = binding_entry_concat_df.drop_duplicates(subset=['site', 'wildtype', 'region'])

mean_df = tmp_df.merge(subset_df[['site', 'wildtype', 'region']], on='site', how='left')
display(mean_df.head())
site cell_type effect binding_mean wildtype region
0 71 CHO-bEFNB2 -1.176113 0.043824 Q Stalk
1 71 CHO-bEFNB3 -0.616870 -0.111348 Q Stalk
2 72 CHO-bEFNB2 -1.231829 0.079725 N Stalk
3 72 CHO-bEFNB3 -0.759448 -0.103486 N Stalk
4 73 CHO-bEFNB2 -0.742614 0.130661 Y Stalk

Make a pivot table for plotting certain data¶

In [10]:
# Reshape to one row per mutation with a separate value column per cell type
df_pivot = binding_entry_concat_df.pivot_table(
    index=['region', 'site', 'wildtype', 'mutant', 'mutant_type'],
    columns='cell_type',
    values=['effect', 'binding_mean'],
    aggfunc='first',
).reset_index()

# Collapse the two-level column labels: value columns become "<value>_<cell type>",
# the former index columns (empty second level) keep their plain name
df_pivot.columns = [
    top if not cell else '_'.join((top, cell)).strip()
    for top, cell in df_pivot.columns.values
]

# Shorten the cell-type suffixes to E2 / E3
df_pivot = df_pivot.rename(columns={
    'effect_CHO-bEFNB2': 'effect_E2',
    'effect_CHO-bEFNB3': 'effect_E3',
    'binding_mean_CHO-bEFNB2': 'binding_E2',
    'binding_mean_CHO-bEFNB3': 'binding_E3'
})

display(df_pivot.sort_values('site').head())
region site wildtype mutant mutant_type binding_E2 binding_E3 effect_E2 effect_E3
8545 Stalk 71 Q P Special NaN NaN -3.385000 -3.36700
8546 Stalk 71 Q R Positive 0.14460 -0.07310 -0.650800 0.06744
8542 Stalk 71 Q L Hydrophobic 0.05129 -0.03759 -1.072000 -0.13050
8541 Stalk 71 Q K Positive 0.08932 -0.03605 0.005061 -0.25570
8539 Stalk 71 Q H Positive 0.02003 -0.06030 -0.376400 -0.05103

Make plots¶

Make heatmap of correlations between entry in CHO-bEFNB2 and CHO-bEFNB3¶

In [11]:
def correlation_heatmap(df):
    """Binned 2D heatmap of entry effects in CHO-bEFNB2 versus CHO-bEFNB3.

    `df` needs the columns ``effect_E2`` and ``effect_E3``. Both axes are
    binned (up to 75 bins) and each cell is coloured by the number of
    mutations it holds on a log colour scale. Returns the configured chart.
    """
    x_binned = alt.X(
        "effect_E2",
        title="RBP mutant entry in CHO-bEFNB2",
        axis=alt.Axis(values=[-4,-3,-2,-1,0,1]),
    ).bin(maxbins=75)
    y_binned = alt.Y(
        "effect_E3",
        title="RBP mutant entry in CHO-bEFNB3",
        axis=alt.Axis(values=[-4,-3,-2,-1,0,1]),
    ).bin(maxbins=75)
    count_colour = alt.Color('count():Q',title='Count').scale(type='log')

    heatmap_title = alt.Title(
        'Effects of RBP Mutations on Cell Entry',
        subtitle='Between CHO cells expressing bat EFNB2 or EFNB3',
    )
    heatmap = alt.Chart(df,title=heatmap_title).mark_rect().encode(
        x_binned,
        y_binned,
        count_colour,
        tooltip=['count()'],
    )
    sized = heatmap.properties(height=300, width=300)

    # legend sits inside the plot area, in the top-left corner
    return sized.configure_legend(
        padding=2,
        orient='top-left',
        labelFontSize=16,
        titlePadding=2,
        symbolSize=100,
    )
corr_heatmap = correlation_heatmap(merged_df)
corr_heatmap.display()
# Save only when a destination for *this* figure was supplied; the guard used
# to test an unrelated output parameter, which would call .save(None) if only
# that other parameter had been set.
if output_corr is not None:
    corr_heatmap.save(output_corr)

Make interactive plot linking individual binding and entry effects with top 10 summed binding and entry¶

In [12]:
def plot_entry_binding_interactive(df,name):
    """Scatter of cell entry vs. binding linked to two "top 10 sites" bar charts.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``site``, ``wildtype``, ``mutant``, ``binding_mean``
        and ``effect``. The frame is copied, so the caller's data is untouched.
    name : str
        Label inserted into the chart title.

    Reads the notebook-level ``config['contact_sites']`` to flag contact sites.

    Returns
    -------
    The scatter plot stacked on top of the two bar charts (entry on the left,
    binding on the right).
    """
    #find contact sites
    df_copy = df.copy()
    df_copy.loc[:, 'is_contact'] = df_copy['site'].isin(config['contact_sites'])
    
    # make a brush for interactivity
    brush = alt.selection_interval() 
    
    #scatter plot; points inside the brush are coloured by contact status, the rest are gray
    chart = alt.Chart(df_copy).mark_point(filled=True,size=50).encode(
            alt.X("effect", title="Cell Entry", axis=alt.Axis(values=[-2,-1,0,1])),
            alt.Y("binding_mean", title="Binding", axis=alt.Axis(values=[-4,-2,0,2])),
            color=alt.condition(brush, 'is_contact', alt.value('lightgray')),  
            tooltip=["site", "wildtype", "mutant", "binding_mean","effect"]  
    ).add_params(
        brush
    ).properties(
        width=400, 
        height=400
    )  
    
    # Create a bar chart showing the sum of binding_mean values for the top 10 sites filtered by the selection.
    bars_binding = alt.Chart(df_copy).transform_filter(
        brush  # Apply the selection filter to include only selected data.
    ).transform_aggregate(
        binding_aggr='sum(binding_mean)',  # Aggregate data by summing up binding for selected sites
        groupby=['site', 'is_contact']
    ).transform_window(
        rank='rank(binding_aggr)',  # Rank sites based on the aggregated sum.
        sort=[alt.SortField('binding_aggr', order='descending')]  # Sort by descending order of sum.
    ).transform_filter(
        alt.datum.rank <= 10  # Filter to keep only the top 10 ranked sites.
    ).mark_bar().encode(
        alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),  # Encode site names on x-axis with custom sorting and label angle.
        alt.Y('binding_aggr:Q', title='Binding'),  # Encode aggregated sum on y-axis.
        color=alt.Color('is_contact', title='Receptor Contact Site')  # Color bars based on whether they are contact sites.
    ).properties(
        width=200, 
        height=50
    )  
    
    # Same pipeline as the binding bar chart above, but aggregates and ranks sites by the summed 'effect' value.
    bars_effect = alt.Chart(df_copy,title='Top 10').transform_filter(
        brush
    ).transform_aggregate(
        effect_aggr='sum(effect)',
        groupby=['site', 'is_contact']
    ).transform_window(
        rank='rank(effect_aggr)',
        sort=[alt.SortField('effect_aggr', order='descending')]
    ).transform_filter(
        alt.datum.rank <= 10
    ).mark_bar().encode(
        x=alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),
        y=alt.Y('effect_aggr:Q', title='Entry'),
        color=alt.Color('is_contact', title='Receptor Contact Site')
    ).properties(
        width=200, 
        height=50
    )
    
    # Scatter plot on top; beneath it the two bar charts side by side (entry left, binding right).
    combined_chart = chart & (bars_effect | bars_binding)
    combined_chart = combined_chart.properties(
        title={
            "text": f"Correlation of Cell Entry and Binding for {name}", 
            "subtitle": ["Draw box in scatterplot to show the top 10 sites by summed binding or entry"],
        }
    )
    return combined_chart
In [13]:
entry_binding_corr_plot_E2 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-bEFNB2"'),'bEFNB2')
entry_binding_corr_plot_E2.display()
# Guard on this figure's own output path rather than on an unrelated parameter
if entry_binding_corr_plot_E2_output is not None:
    entry_binding_corr_plot_E2.save(entry_binding_corr_plot_E2_output)

Now do the same as above for EFNB3¶

In [14]:
entry_binding_corr_plot_E3 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-bEFNB3"'),'bEFNB3')
entry_binding_corr_plot_E3.display()
# Guard on this figure's own output path rather than on an unrelated parameter
if entry_binding_corr_plot_E3_output is not None:
    entry_binding_corr_plot_E3.save(entry_binding_corr_plot_E3_output)

Make correlation plots for entry and binding for both efnb2 and efnb3 colored by protein region¶

In [15]:
# Inspect the pivoted table (one row per mutation, E2/E3 values in separate
# columns) that is passed to correlation_plot below
display(df_pivot)
region site wildtype mutant mutant_type binding_E2 binding_E3 effect_E2 effect_E3
0 Head 178 V A Hydrophobic 0.7066 0.008861 -0.2181 0.01306
1 Head 178 V C Special 0.1814 0.451400 0.1203 0.47640
2 Head 178 V D Negative NaN -0.041930 -1.9200 -1.03800
3 Head 178 V E Negative NaN 0.142800 -1.7900 -0.41900
4 Head 178 V F Aromatic 0.5869 0.039550 -0.7901 -0.34260
... ... ... ... ... ... ... ... ... ...
9934 Stalk 147 K S Hydrophilic 0.1344 -0.060950 0.1857 0.13650
9935 Stalk 147 K T Hydrophilic 1.0700 -0.052750 -0.3402 -0.79560
9936 Stalk 147 K V Hydrophobic NaN 0.086850 -1.9730 -1.02500
9937 Stalk 147 K W Aromatic NaN NaN -2.9010 -2.27500
9938 Stalk 147 K Y Aromatic NaN NaN -2.9410 -1.39500

9939 rows × 9 columns

In [16]:
def correlation_plot(df):
    """Side-by-side E2-vs-E3 scatter plots for cell entry and receptor binding.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``region``, ``site``, ``wildtype``, ``mutant``,
        ``effect_E2``, ``effect_E3``, ``binding_E2`` and ``binding_E3``.
        Values are rounded to two decimals before plotting.

    Returns
    -------
    The two scatter plots concatenated horizontally. A radio button selects a
    region: its points keep their region colour, all others turn light gray
    and half transparent.
    """
    df = df.round(2)

    # radio buttons to highlight one region (or "All")
    options = ['Stalk', 'Neck', 'Linker','Head']
    labels = [option + ' ' for option in options]

    region_radio = alt.binding_radio(
        options=options + [None],
        labels=labels + ['All'],
        name='Region: '
    )
    selection = alt.selection_point(
        fields=['region'],
        bind=region_radio,
    )
    color = alt.condition(
        selection,
        alt.Color('region:N',scale=alt.Scale(domain=options),title='Region'),
        alt.value('lightgray'),
    )
    opacity = alt.condition(
        selection,
        alt.value(1),
        alt.value(0.5)
    )

    #make effect chart
    effect_chart = (
        alt.Chart(df,title='Cell entry')
        .mark_point(size=30,opacity=1,filled=True)
        .encode(
            alt.X("effect_E2", title="Entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
            alt.Y("effect_E3", title="Entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
            tooltip=['wildtype',"site", "mutant",'effect_E2','effect_E3'],
            opacity=opacity,
            color=color,
        )
    ).properties(
        height=300,
        width=300
    )
    #make binding chart
    binding_chart = (
        alt.Chart(df,title='Receptor Binding')
        .mark_point(size=30,opacity=1,filled=True)
        .encode(
            alt.X("binding_E2", title="bEFNB2 Binding",axis=alt.Axis(tickCount=4)),
            alt.Y("binding_E3", title="bEFNB3 Binding",axis=alt.Axis(tickCount=4)),
            tooltip=['wildtype',"site", "mutant",'effect_E2','effect_E3','binding_E2','binding_E3'],
            color=color,
            opacity=opacity,
        )
    ).properties(
        height=300,
        width=300
    )
    #combine charts; the region selection is registered once on the combined chart
    combined_chart = effect_chart | binding_chart
    combined_chart=combined_chart.add_params(selection).properties(
        title=alt.Title('Correlations of RBP Mutations Between bEFNB2 and bEFNB3 for Entry and Binding', offset=30,
                        subtitle=['Select radio button to see mutants highlighted or hover over points to see more information'])
    )
    return combined_chart

# Call function above and save
corr_entry_binding_large = correlation_plot(df_pivot)
corr_entry_binding_large.display()
# Guard on this figure's own output path rather than on an unrelated parameter
if corr_entry_binding_large_output is not None:
    corr_entry_binding_large.save(corr_entry_binding_large_output)

Make figures showing only binding¶

In [17]:
def make_custom_binding_figure(df,name):
    """Jittered strip plot of binding by RBP region with a linked top-10 bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``binding_mean``, ``region``, ``site`` and ``mutant``.
    name : str
        Title of the strip plot.

    Returns
    -------
    The strip plot stacked above a bar chart of the ten sites with the largest
    summed binding among the points inside the brush.
    """
    brush = alt.selection_interval() #define selection brush
    custom_order = ["Stalk", "Neck", "Linker", "Head"]

    chart = alt.Chart(df,title=alt.Title(f'{name}')).mark_point(opacity=0.3,filled=True).encode(
            alt.X(
                "binding_mean",
                title="Binding",
                axis=alt.Axis(tickCount=4),
            ),
            alt.Y(
                "region:O",
                sort=custom_order,
                title="RBP Region",
            ),
            # random vertical offset so points within a region do not overplot
            yOffset="random:Q",
            tooltip=["region", "binding_mean", "site", "mutant"],
            color=alt.condition(brush, 'region', alt.value('lightgray')),
        ).transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())").properties(height=200,width=300).add_params(brush)

    # sum binding per site over the brushed points and keep the ten highest sums
    bars = alt.Chart(df).transform_filter(
            brush
        ).transform_aggregate(
            binding_aggr='sum(binding_mean)',
            groupby=['site', 'region']
        ).transform_window(
            rank='rank(binding_aggr)',
            sort=[alt.SortField('binding_aggr', order='descending')]
        ).transform_filter(
            alt.datum.rank <= 10
        ).mark_bar().encode(
            y=alt.Y('binding_aggr:Q',title='Binding'),
            x=alt.X('site:N', sort='-y',title='Site'),
            color=alt.Color('region',title='Region')
        ).properties(height=50,width=300)

    combined_chart = chart & bars
    return combined_chart
In [18]:
#make individual bEFNB2 plot
efnb2_binding_region = make_custom_binding_figure(binding_entry_concat_df.query('cell_type == "CHO-bEFNB2"'),'bEFNB2')

#make individual bEFNB3 plot
efnb3_binding_region = make_custom_binding_figure(binding_entry_concat_df.query('cell_type == "CHO-bEFNB3"'),'bEFNB3')

# combine the plots
combined_binding = (efnb2_binding_region | efnb3_binding_region).properties(
    title=alt.Title('Effects of RBP Mutations on Receptor Binding', offset=30,
    subtitle='Draw boxes on scatter plots to see the top sites by summed binding')
)

combined_binding.display()
# Guard on this figure's own output path rather than on an unrelated parameter
if combined_binding_output is not None:
    combined_binding.save(combined_binding_output)

Make interactive plots of average effects of mutants by site¶

In [19]:
def entry_by_site(df):
    """Bar chart of the mean entry effect at every site, one row per cell type.

    `df` needs the columns ``site``, ``effect``, ``wildtype``, ``region`` and
    ``cell_type``; values are rounded to two decimals before plotting.
    Hovering over a bar outlines it in black. Returns the faceted chart.

    NOTE(review): a second function with this same name but the signature
    ``entry_by_site(df, name, effect)`` is defined later in this notebook and
    replaces this one once its cell has run, so this version is only callable
    when the cells are executed in order.
    """
    df = df.round(2)
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend

    # hover selection keyed on the site; nothing is selected until the mouse moves
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    chart = (
        alt.Chart(df)
        .mark_bar(opacity=1,stroke='black')
        .encode(
            alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y("effect", title="Mean entry"),
            tooltip=['wildtype',"site", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
            # the black stroke is only visible (width 1) on the hovered bar
            strokeWidth=alt.condition(
                    variant_selector, alt.value(1), alt.value(0)
                ),
            row=alt.Row('cell_type',title=None,header=alt.Header(labelFontSize=16,labelFontWeight='bold'))
        )
    ).properties(
        width=800,
        height=150
    )
    combined_chart = chart.properties(
        title=alt.Title('Average Cell Entry of Mutants at Each Site',
        subtitle=['Hover mouse over bars to view information about cell entry'])
    ).add_params(variant_selector)
    
    return combined_chart

# Build and show the per-site mean entry chart (displayed only, not saved).
# NOTE(review): relies on the one-argument `entry_by_site` defined directly
# above; the name is redefined with three arguments further down the notebook,
# so re-running this cell after that later cell will fail.
entry_by_site_plot = entry_by_site(mean_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')
In [20]:
def binding_by_site(df):
    """Bar chart of the mean binding effect at every site, one row per cell type.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``site``, ``cell_type``, ``effect``,
        ``binding_mean``, ``wildtype`` and ``region``. Values are rounded to
        two decimals before plotting; the caller's frame is not modified.

    Returns
    -------
    The faceted bar chart with an independent y scale per cell type.
    Hovering over a bar outlines it in black.
    """
    # Site 500 is filtered out upstream, so add a zero-height placeholder to
    # keep its tick on the axis - but only if the site really is missing.
    if not (df['site'] == 500).any():
        placeholder = pd.DataFrame({
            'site': [500],
            'cell_type': ['CHO-bEFNB2'],
            'effect': [0],
            'binding_mean': [0],
            'wildtype': [None],  # no wildtype information for the placeholder
            'region': ['Head']
        })
        df = pd.concat([df, placeholder], ignore_index=True)

    df = df.round(2)
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend

    # hover selection keyed on the site
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )

    chart = (
        alt.Chart(df)
        .mark_bar(opacity=1,stroke='black')
        .encode(
            alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y("binding_mean", title="Mean binding"),
            # show the plotted quantity (binding) in the tooltip; it used to
            # list only the entry effect, copied from the entry chart
            tooltip=['wildtype',"site", "binding_mean", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
            strokeWidth=alt.condition(
                    variant_selector, alt.value(1), alt.value(0)
                ),
            row=alt.Row('cell_type',title=None,header=alt.Header(labelFontSize=16,labelFontWeight='bold'))
        )
    ).properties(
        width=800,
        height=150
    )
    combined_chart = chart.properties(
        title=alt.Title('Average Receptor Binding of Mutants at Each Site',
        subtitle=['Hover mouse over bars to view information about receptor binding'])
    ).add_params(variant_selector).resolve_scale(y='independent')

    return combined_chart
In [21]:
#call function above
# NOTE(review): the binding chart is stored under the name `entry_by_site_plot`,
# overwriting the entry chart created in the earlier cell - consider a
# dedicated name such as `binding_by_site_plot`.
entry_by_site_plot = binding_by_site(mean_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')

Make interactive chart similar to above but also show individual mutations in heatmaps¶

In [22]:
def entry_by_site(df,name,effect):
    """Per-site mean entry bar chart linked to a one-row heatmap of single mutants.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``site``, ``mutant``, ``wildtype``, ``region`` and
        the column named by `effect`.
    name : str
        Cell line label inserted into the chart title.
    effect : str
        Name of the column holding the entry effect to plot.

    Returns
    -------
    The bar chart stacked above the heatmap; hovering over a site in the bar
    chart shows that site's mutants in the heatmap.

    NOTE(review): this definition reuses the name of the one-argument
    ``entry_by_site(df)`` defined earlier in the notebook and replaces it.
    """
    # make an empty dataframe with every possible mutation so empty values still get plotted
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]
    sites = range(71, 603)
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acid_order]
    empty_df = pd.DataFrame(data)
    # left merge: site/mutant combinations absent from df keep NaN in every other column
    full_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')
 
    #setup interactive features
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )
    
    #make base chart
    base = alt.Chart(full_df).add_params(variant_selector)
    #add bar chart of cell entry by site
    chart = base.mark_bar(opacity=1,stroke='black').encode(
            alt.X("site:N", title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y(f"mean({effect})", title="Mean entry"),
            tooltip=["site", "wildtype","region"],
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.7)),
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            color=alt.Color('region',title='Region')
    ).properties(
        width=800,
        height=200
    )

    #add heatmap below showing effect of mutants on cell entry
    select_bar = alt.Chart(full_df).mark_bar(stroke='black').encode(
        alt.X('mutant:N',title=None,scale=alt.Scale(domain=amino_acid_order)),
        color=alt.Color(f'{effect}',legend=alt.Legend(orient='right',direction='horizontal',titleAlign='center',titleAnchor='middle'),title='Cell entry',scale=alt.Scale(scheme='redblue',domainMid=0,domain=[-4,2]))
    ).transform_filter(
        variant_selector
    ).properties(
        width=400,
        height=10
    )

    #make heatmap have an x for wildtype residue
    # `!= None` is deliberate: Altair turns this comparison into a filter
    # expression, so `is not None` cannot be used here
    select_bar_wildtype = alt.Chart(full_df).mark_text(color="black", text="X", size=10, align="center", baseline="middle").encode(
        alt.X('wildtype:N',title='Amino acid'),
    ).transform_filter(
        variant_selector
    ).transform_filter(
        (alt.datum[effect] != None) #filter out empty data
    ).properties(
        width=400,
        height=10
    )
    # combine heatmap plots
    combined_effects_w_wildtype = alt.layer(select_bar_wildtype,select_bar).resolve_scale(x='shared')

    # combine the bar and heatmaps
    combined_chart = alt.vconcat(chart,combined_effects_w_wildtype).resolve_scale(y='independent',x='independent')
    combined_chart = combined_chart.properties(
        title=alt.Title(f'Entry in {name}', offset=30,
        subtitle=['Hover over sites to see information about entry of specific mutations'])
    )
    return combined_chart
In [23]:
# call chart function
entry_by_site_plot_e2 = entry_by_site(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e2.display()
# Guard on the E2 output path itself; the check used to look at the E3 path
if entry_by_site_plot_e2_output is not None:
    entry_by_site_plot_e2.save(entry_by_site_plot_e2_output)
In [24]:
# Build, show and (when an output path was supplied) save the CHO-bEFNB3 chart
entry_by_site_plot_e3 = entry_by_site(df_pivot, name='CHO-bEFNB3', effect='effect_E3')
entry_by_site_plot_e3.display()
if entry_by_site_plot_e3_output is not None:
    entry_by_site_plot_e3.save(entry_by_site_plot_e3_output)

TESTING. Stuff below is a work in progress¶

In [25]:
def entry_by_site_test(df,name,effect):
    """Per-site mean entry bar chart linked to a bar chart of single mutants.

    Work-in-progress variant of ``entry_by_site``: the lower panel shows the
    mutants of the hovered site as bars (height and colour = effect) instead
    of a one-row heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``site``, ``mutant``, ``wildtype``, ``region`` and
        the column named by `effect`.
    name : str
        Cell line label inserted into the chart title.
    effect : str
        Name of the column holding the entry effect to plot.

    Returns
    -------
    The per-site bar chart stacked above the per-mutant bar chart.
    """
    # make an empty dataframe with every possible mutation so empty values still get plotted
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]
    sites = range(71, 603)
    data = [{"site": site, "mutant": aa} for site in sites for aa in amino_acid_order]
    empty_df = pd.DataFrame(data)
    full_df = pd.merge(empty_df,df,on=['site','mutant'],how='left')

    #setup interactive features
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=["site"], value=1
    )

    #make base chart
    base = alt.Chart(full_df).add_params(variant_selector)
    #add bar chart of cell entry by site
    # (the former opacity condition had the value 1 in both branches, so the
    # mark-level opacity=1 alone gives the same rendering)
    chart = base.mark_bar(opacity=1,stroke='black').encode(
            alt.X("site:N", title='Site',axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            alt.Y(f"mean({effect})", title="Mean cell entry"),
            tooltip=["site", "wildtype","region"],
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            color=alt.Color('region',title='Region')
    ).properties(
        width=800,
        height=150
    )

    #bar chart of the individual mutants at the hovered site, in fixed amino-acid order
    # (a window transform computing an unused `rank` field was dropped)
    select_bar = alt.Chart(full_df).mark_bar(stroke='black', color='darkgray').encode(
        alt.X('mutant:N', title=None, scale=alt.Scale(domain=amino_acid_order)),
        alt.Y(f'{effect}', scale=alt.Scale(domain=[-4, 2]), title='Cell entry'),
        alt.Color(f'{effect}',title='Effect', legend=None,scale=alt.Scale(scheme='redblue',domainMid=0,domain=[-4,2])),
    ).transform_filter(
        variant_selector
    ).properties(
        width=400,
        height=150
    )

    #mark the wildtype residue of the hovered site with an X
    # `!= None` is deliberate: Altair turns this comparison into a filter
    # expression, so `is not None` cannot be used here
    select_bar_wildtype = alt.Chart(full_df).mark_text(color="black", text="X", size=14, align="center", baseline="middle",dy=-20).encode(
        alt.X('wildtype:N',title='Amino acid'),
    ).transform_filter(
        variant_selector
    ).transform_filter(
        (alt.datum[effect] != None) #filter out empty data
    ).properties(
        width=400,
        height=150
    )

    #layer the wildtype marker with the per-mutant bars
    combined_effects_w_wildtype = alt.layer(select_bar_wildtype,select_bar).resolve_scale(x='shared')

    #stack the per-site chart on top of the per-mutant chart
    combined_chart = alt.vconcat(chart,combined_effects_w_wildtype).resolve_scale(y='independent',x='independent')
    combined_chart = combined_chart.properties(
        title=alt.Title(f'Cell Entry in {name}', offset=30,
        subtitle=['Effects of RBP mutations on cell entry','Hover over sites to see information about specific mutations'])
    )
    return combined_chart

# Call functions to plot
entry_by_site_plot_e2_bar = entry_by_site_test(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e3_bar = entry_by_site_test(df_pivot,'CHO-bEFNB3','effect_E3')

# Visualize plots
entry_by_site_plot_e2_bar.display()
entry_by_site_plot_e3_bar.display()

# Save each plot only when its own output path was supplied (papermill run);
# the guard used to test an unrelated output parameter for both saves
if entry_by_site_plot_e2_bar_plot is not None:
    entry_by_site_plot_e2_bar.save(entry_by_site_plot_e2_bar_plot)
if entry_by_site_plot_e3_bar_plot is not None:
    entry_by_site_plot_e3_bar.save(entry_by_site_plot_e3_bar_plot)
In [26]:
def make_effect_by_site_with_hover_tooltip(df):
    """Line chart of mean ``effect`` per site, one line per ``cell_type``, with a hover read-out.

    ``df`` is averaged over ``effect`` within each (``cell_type``, ``site``) group and
    rounded to 2 decimals. Moving the mouse over the chart picks the nearest ``site``;
    a gray vertical rule, a point marker and the numeric mean are then drawn at that site.

    Returns the layered altair chart (800 x 200).
    """
    site_means = (
        df.groupby(['cell_type', 'site'])['effect']
        .mean()
        .reset_index()
        .round(2)
    )

    # Selection that follows the cursor and snaps to the closest site on the x-axis
    nearest = alt.selection_point(
        nearest=True, on='mouseover', fields=['site'], empty=False
    )

    # Base layer: smoothed mean-entry line per cell type
    site_ticks = list(range(100, 601, 50))
    line = alt.Chart(site_means).mark_line(interpolate='basis', size=1).encode(
        x=alt.X('site:Q', title='Site', axis=alt.Axis(values=site_ticks)),
        y=alt.Y('effect:Q', title='Mean entry'),
        color=alt.Color('cell_type:N', title='Cell type'),
    )

    # Invisible points spanning the x-axis; they carry the selection so the
    # cursor position can be mapped to a site
    selectors = (
        alt.Chart(site_means)
        .mark_point()
        .encode(x='site:Q', opacity=alt.value(0))
        .add_params(nearest)
    )

    # Markers on the line, visible only at the selected site
    hidden_unless_selected = alt.condition(nearest, alt.value(1), alt.value(0))
    points = line.mark_point().encode(opacity=hidden_unless_selected)

    # Numeric label next to the marker, blank everywhere except the selected site
    label_or_blank = alt.condition(nearest, 'effect:Q', alt.value(' '))
    text = line.mark_text(align='left', dx=5, dy=-5, fontSize=15).encode(
        text=label_or_blank
    )

    # Vertical guide drawn at the selected site
    rules = (
        alt.Chart(site_means)
        .mark_rule(color='gray')
        .encode(x='site:Q')
        .transform_filter(nearest)
    )

    # Stack all layers into a single chart
    layered = alt.layer(line, selectors, points, rules, text)
    return layered.properties(width=800, height=200)

# Build the mean-entry-by-site hover chart from binding_entry_concat_df and render it inline.
# This chart is only displayed in this cell; it is not written to an output file here.
alt_plot = make_effect_by_site_with_hover_tooltip(binding_entry_concat_df)
alt_plot.display()
In [27]:
def plot_affinity_individual_mutants(df):
    """Build two linked scatter plots of bEFNB2 vs bEFNB3 binding.

    The left panel plots the rows of ``df`` as points (``binding_E2`` on x,
    ``binding_E3`` on y). Hovering selects the nearest point's ``site``: points at
    that site are highlighted, and the right panel shows only that site's rows,
    each drawn as its ``mutant`` letter and colored by ``mutant_type``.
    Dashed gray rules mark x=0 and y=0 in both panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns referenced below: ``site``, ``wildtype``, ``mutant``,
        ``mutant_type``, ``binding_E2``, ``binding_E3``, ``effect_E2``, ``effect_E3``.
        Values are rounded to 2 decimals on a copy, so the caller's frame is untouched.

    Returns
    -------
    The horizontally concatenated altair chart (scatter on the left, letters on the right).
    """
    df = df.round(2).copy()

    # Hover selection keyed on site; shared by both panels so they stay linked
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=['site'], value=1
    )

    # make left side: all mutations, with the hovered site highlighted
    all_sites_scatter = (alt.Chart(df)
        .mark_point(filled=True,stroke='black',size=50)
        .encode(
            alt.X("binding_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4.5,2])),
            alt.Y("binding_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-2.5,2])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
            color=alt.condition(variant_selector,alt.value('#af7aa1'),alt.value('gray')),
            strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0))
        )
        .add_params(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )

    # make amino acid letters: only the rows of the currently hovered site
    chart = (alt.Chart(df)
        .mark_text(size=20)
        .encode(
            alt.X("binding_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4.5,2])),
            alt.Y("binding_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-2.5,2])),
            alt.Text('mutant'),
            alt.Color('mutant_type',title='Mutant type',scale=alt.Scale(
                    domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
                    range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
        )
        .add_params(variant_selector)
        .transform_filter(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )
    # Vertical line at x=0
    vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
    # Horizontal line at y=0
    hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
    # Layer the zero reference lines under each panel, then place the panels side by side
    final_chart = vline + hline + chart
    final_scatter = vline + hline + all_sites_scatter
    combined = alt.hconcat(final_scatter,final_chart,spacing=20).properties(title=alt.Title('Effects of RBP mutations on binding to bEFNB2 and bEFNB3',subtitle='Hover over points to see individual amino acids'))
    return combined

# Build and show the linked binding scatter / amino-acid letter plot
test_plot = plot_affinity_individual_mutants(df_pivot)
test_plot.display()
# Save only when this plot's own output path was supplied (i.e. papermill is running)
if binding_letter_plot is not None:
    test_plot.save(binding_letter_plot)
In [28]:
def plot_affinity_individual_mutants(df):
    """Build two linked scatter plots of cell entry in CHO-bEFNB2 vs CHO-bEFNB3.

    NOTE: this function shares its name with the binding version defined in an
    earlier cell and replaces it once this cell runs; despite the name, it plots
    the entry columns (``effect_E2`` / ``effect_E3``), not binding.

    The left panel plots the rows of ``df`` as points (``effect_E2`` on x,
    ``effect_E3`` on y). Hovering selects the nearest point's ``site``: points at
    that site are highlighted, and the right panel shows only that site's rows,
    each drawn as its ``mutant`` letter and colored by ``mutant_type``.
    Dashed gray rules mark x=0 and y=0 in both panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns referenced below: ``site``, ``wildtype``, ``mutant``,
        ``mutant_type``, ``binding_E2``, ``binding_E3``, ``effect_E2``, ``effect_E3``.
        Values are rounded to 2 decimals on a copy, so the caller's frame is untouched.

    Returns
    -------
    The horizontally concatenated altair chart (scatter on the left, letters on the right).
    """
    df = df.round(2).copy()

    # Hover selection keyed on site; shared by both panels so they stay linked
    variant_selector = alt.selection_point(
        on="mouseover", empty=False,nearest=True, fields=['site'], value=1
    )

    # make left side: all mutations, with the hovered site highlighted
    all_sites_scatter = (alt.Chart(df)
        .mark_point(filled=True,stroke='black',size=50)
        .encode(
            alt.X("effect_E2", title=("Entry in CHO-bEFNB2"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            alt.Y("effect_E3", title=("Entry in CHO-bEFNB3"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
            color=alt.condition(variant_selector,alt.value('#af7aa1'),alt.value('gray')),
            strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0))
        )
        .add_params(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )

    # make amino acid letters: only the rows of the currently hovered site
    chart = (alt.Chart(df)
        .mark_text(size=20)
        .encode(
            alt.X("effect_E2", title=("Entry in CHO-bEFNB2"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            alt.Y("effect_E3", title=("Entry in CHO-bEFNB3"),axis=alt.Axis(tickCount=4),scale=alt.Scale(domain=[-4,1])),
            alt.Text('mutant'),
            alt.Color('mutant_type',title='Mutant type',scale=alt.Scale(
                    domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
                    range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"])),
            tooltip=['site','wildtype','binding_E2','binding_E3','effect_E2','effect_E3'],
        )
        .add_params(variant_selector)
        .transform_filter(variant_selector)
        .properties(
            height=300,
            width=300
        )
    )
    # Vertical line at x=0
    vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(x='x:Q')
    # Horizontal line at y=0
    hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray',opacity=1,strokeDash=[2,4]).encode(y='y:Q')
    # Layer the zero reference lines under each panel, then place the panels side by side
    final_chart = vline + hline + chart
    final_scatter = vline + hline + all_sites_scatter
    combined = alt.hconcat(final_scatter,final_chart,spacing=20).properties(title=alt.Title('Effects of RBP mutations on cell entry',subtitle='Hover over points to see individual amino acids'))
    return combined

# Build and show the linked entry scatter / amino-acid letter plot
test_plot = plot_affinity_individual_mutants(df_pivot)
test_plot.display()
# Save only when this plot's own output path was supplied (i.e. papermill is running)
if entry_letter_plot is not None:
    test_plot.save(entry_letter_plot)
In [29]:
def plot_entry_slider(df):
    """Amino-acid letter scatter of cell entry for a single site chosen with a slider.

    Rows of ``df`` (rounded to 2 decimals on a copy) are filtered to the site picked by
    a range slider (71-602, initial value 71) and drawn as their ``mutant`` letter at
    (``effect_E2``, ``effect_E3``), colored by ``mutant_type``. Dashed gray rules mark
    x=0 and y=0. Returns the layered altair chart.
    """
    rounded = df.round(2).copy()

    # Hover selection on site. It is attached to the letter layer below but is not
    # referenced by any encoding or filter in this function.
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=['site'], value=1
    )

    # Slider-bound selection that decides which site is shown
    site_slider = alt.binding_range(min=71, max=602, step=1, name='Site ')
    selector = alt.selection_point(
        name="SelectorName",
        fields=['site'],
        bind=site_slider,
        value=[{'site': 71}],
    )

    # Fixed category -> color mapping for the amino-acid classes
    mutant_palette = alt.Scale(
        domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
        range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"],
    )
    plot_title = alt.Title(
        'Effects of RBP mutations on cell entry',
        offset=30,
        subtitle='Use slider to see individual mutations at each site',
    )

    # Letter layer: one amino-acid letter per mutation at the selected site
    letters = alt.Chart(rounded).mark_text(size=20).encode(
        x=alt.X("effect_E2", title=("Entry in CHO-bEFNB2"), axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=[-4,1])),
        y=alt.Y("effect_E3", title=("Entry in CHO-bEFNB3"), axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=[-4,1])),
        text=alt.Text('mutant'),
        color=alt.Color('mutant_type', title='Mutant type', scale=mutant_palette),
        tooltip=['site','wildtype','effect_E2','effect_E3'],
    )
    letters = letters.add_params(variant_selector, selector)
    letters = letters.transform_filter(selector)
    letters = letters.properties(height=300, width=300, title=plot_title)

    # Dashed zero reference lines (vertical at x=0, horizontal at y=0)
    zero_x = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(
        color='gray', opacity=0.5, strokeDash=[2,4]
    ).encode(x='x:Q')
    zero_y = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
        color='gray', opacity=0.5, strokeDash=[2,4]
    ).encode(y='y:Q')

    # Reference lines go underneath the letters
    return zero_x + zero_y + letters

# Build and show the slider-driven entry letter plot
entry_slider_plot = plot_entry_slider(df_pivot)
entry_slider_plot.display()
# Save only when this plot's own output path was supplied (i.e. papermill is running)
if entry_letter_plot_slider is not None:
    entry_slider_plot.save(entry_letter_plot_slider)
In [30]:
def plot_affinity_slider(df):
    """Amino-acid letter scatter of receptor binding for a single site chosen with a slider.

    Rows of ``df`` (rounded to 2 decimals on a copy) are filtered to the site picked by
    a range slider (71-602, initial value 71) and drawn as their ``mutant`` letter at
    (``binding_E2``, ``binding_E3``), colored by ``mutant_type``. Dashed gray rules mark
    x=0 and y=0. Returns the layered altair chart.
    """
    rounded = df.round(2).copy()

    # Hover selection on site. It is attached to the letter layer below but is not
    # referenced by any encoding or filter in this function.
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=['site'], value=1
    )

    # Slider-bound selection that decides which site is shown
    site_slider = alt.binding_range(min=71, max=602, step=1, name='Site ')
    selector = alt.selection_point(
        name="SelectorName",
        fields=['site'],
        bind=site_slider,
        value=[{'site': 71}],
    )

    # Fixed category -> color mapping for the amino-acid classes
    mutant_palette = alt.Scale(
        domain=['Aromatic', 'Hydrophilic', 'Hydrophobic','Negative', 'Positive', 'Special'],
        range=["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949"],
    )
    plot_title = alt.Title(
        'Effects of RBP mutations on receptor binding',
        offset=30,
        subtitle='Use slider to see individual mutations at each site',
    )

    # Letter layer: one amino-acid letter per mutation at the selected site
    letters = alt.Chart(rounded).mark_text(size=20).encode(
        x=alt.X("binding_E2", title=("bEFNB2 binding"), axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=[-5,2])),
        y=alt.Y("binding_E3", title=("bEFNB3 binding"), axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=[-2.25,2])),
        text=alt.Text('mutant'),
        color=alt.Color('mutant_type', title='Mutant type', scale=mutant_palette),
        tooltip=['site','wildtype','binding_E2','binding_E3'],
    )
    letters = letters.add_params(variant_selector, selector)
    letters = letters.transform_filter(selector)
    letters = letters.properties(height=300, width=300, title=plot_title)

    # Dashed zero reference lines (vertical at x=0, horizontal at y=0)
    zero_x = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(
        color='gray', opacity=0.5, strokeDash=[2,4]
    ).encode(x='x:Q')
    zero_y = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
        color='gray', opacity=0.5, strokeDash=[2,4]
    ).encode(y='y:Q')

    # Reference lines go underneath the letters
    return zero_x + zero_y + letters

# Build and show the slider-driven binding letter plot
binding_slider_plot = plot_affinity_slider(df_pivot)
binding_slider_plot.display()
# Save only when this plot's own output path was supplied (i.e. papermill is running)
if binding_letter_plot_slider is not None:
    binding_slider_plot.save(binding_letter_plot_slider)
In [ ]: