nipah_RBP_entry_analysis.ipynb¶

This notebook pulls in data on Nipah RBP entry into CHO-bEFNB2 and bCHO-EFNB3 cells, calculates stats, and makes figures¶

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
altair_config = None
surface = None

#input files
e2_distances_file = None
func_scores_E2_file = None
func_scores_E3_file = None
concat_df_file = None
merged_df_file = None

#output files
E2_E3_entry_corr_plot = None
E2_E3_entry_all_muts_plot = None
entry_region_boxplot_plot = None
combined_region_barplot_output = None
In [2]:
# Parameters
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
e2_distances_file = "results/distances/2vsm_distances.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr_plot.html"
E2_E3_entry_all_muts_plot = "results/images/E2_E3_entry_all_muts_plot.html"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entry_region_boxplot_plot = "results/images/entry_region_boxplot_plot.html"
surface = "data/custom_analyses_data/surface_exposure.csv"
combined_region_barplot_output = "results/images/combined_region_barplot_output.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')

Paths for running notebook interactively¶

In [4]:
if nipah_config is None:
    e2_distances_file = "results/distances/2vsm_distances.csv"
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    
    concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
    merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
    
    nipah_config = "nipah_config.yaml"
    #altair_config = "data/custom_analyses_data/theme.py"
    
    surface = "data/custom_analyses_data/surface_exposure.csv"

Read in custom altair theme and import YAML file with parameters¶

In [5]:
with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import filtered data¶

In [6]:
#Import filtered entry scores from different selections
func_scores_E2 = pd.read_csv(func_scores_E2_file)
func_scores_E3 = pd.read_csv(func_scores_E3_file)
concat_df = pd.read_csv(concat_df_file) #concatanated entry scores
merged_df = pd.read_csv(merged_df_file) #merged entry scores
e2_distances = pd.read_csv(e2_distances_file)

Calculate some basic stats about E2 and E3 entry scores¶

In [7]:
def calculate_stats(df, name):
    print(f"For {name}:")
    total_mut = (532) * 19 #total number of possible amino acids
    print(f'There are {total_mut} amino acid mutations possible')
    muts_present = df["effect"].shape[0]
    fraction_muts = muts_present / total_mut
    print(
        f"fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}"
    )

    # Break mutations into bins (negative, neutral, positive) and calculate how many mutants there are
    deleterious_muts = df[df["effect"] <= -0.5].shape[0] 
    neutral_muts = df[(df["effect"] > -0.5) & (df["effect"] < 0.5)].shape[0]
    positive_muts = df[df["effect"] > 0.5].shape[0]

    frac_bad_muts = deleterious_muts / muts_present
    frac_neutral_muts = neutral_muts / muts_present
    frac_pos_muts = positive_muts / muts_present
    print(
        f"The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}"
    )
    print(
        f"The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}"
    )
    print(
        f"The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}\n"
    )

calculate_stats(func_scores_E2, "CHO-bEFNB2")
calculate_stats(func_scores_E3, "CHO-bEFNB3")
For CHO-bEFNB2:
There are 10108 amino acid mutations possible
fraction muts present in CHO-bEFNB2 is 0.97 9786/10108
The number of deleterious mutants for CHO-bEFNB2 is 0.44 4296/9786
The number of neutral mutants for CHO-bEFNB2 is 0.54 5303/9786
The number of positive mutants for CHO-bEFNB2 is 0.02 186/9786

For CHO-bEFNB3:
There are 10108 amino acid mutations possible
fraction muts present in CHO-bEFNB3 is 0.96 9713/10108
The number of deleterious mutants for CHO-bEFNB3 is 0.43 4182/9713
The number of neutral mutants for CHO-bEFNB3 is 0.56 5451/9713
The number of positive mutants for CHO-bEFNB3 is 0.01 80/9713

Find which sites only have negative entry scores for all mutants¶

In [8]:
def overall_stats_all_neg(df,effect,name,region=None):
    print(f'We are analyzing data for {name}:')
    
    if region is None:
        contact_df = df
    else:
        contact_df = df[df['site'].isin(region)]
    
    filtered_df = contact_df.groupby('site').filter(lambda group: (group['effect'] < -2.25).all())
    unique = filtered_df['site'].unique()
    print(list(unique))
    total_sites = contact_df['site'].unique().shape[0]
    subset = filtered_df['site'].unique().shape[0]
       
    fraction = subset/total_sites
    percent = fraction * 100
    print(f'The total number of sites are: {total_sites}\n')
    print(f' The number of sites where all mutants are negative for {effect}: {subset}\n')
    print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}\n')
    return unique

intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect','CHO-bEFNB2'))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect','CHO-bEFNB3',region=config['contact_sites']))
We are analyzing data for CHO-bEFNB2:
[107, 121, 162, 263, 382, 387, 489, 499, 565]
The total number of sites are: 532

 The number of sites where all mutants are negative for effect: 9

 The percent of sites where all mutants are negative for effect: 2

We are analyzing data for CHO-bEFNB3:
[489, 506, 533, 557, 579, 581]
The total number of sites are: 33

 The number of sites where all mutants are negative for effect: 6

 The percent of sites where all mutants are negative for effect: 18

Plot correlations between E2 and E3 entry¶

In [9]:
# merge distance data
distance_df = pd.merge(
    merged_df, e2_distances[["site", "distance"]], on="site", how="left"
)

def determine_distance(df):
    # Define the conditions
    conditions = [
        df["distance"] < 4,
        (df["distance"] >= 4) & (df["distance"] <= 8),
        df["distance"] > 8,
    ]

    # Define the associated values for the conditions
    choices = ["contact", "close", "distant"]

    # Apply the conditions and choices to the 'E2_contact' column
    df["contact"] = np.select(conditions, choices, default="distant")
    return df


distance_df = determine_distance(distance_df)
In [10]:
def mean_correlation_plot(df, metric):
    means = df.groupby('site')[["effect_E2", "effect_E3"]].mean().reset_index().round(2)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["effect_E2"], means["effect_E3"]
    )

    contact_sites = df[["site", "contact", "wildtype"]].drop_duplicates()
    df_mean = pd.merge(means, contact_sites, on="site", how="left")
    display(df_mean)

    chart = alt.Chart(df_mean).mark_point(filled=True,size=50,stroke='black',strokeWidth=1,opacity=1).encode(
        alt.X("effect_E2", title="Mean entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
        alt.Y("effect_E3", title="Mean entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
        tooltip=["wildtype","site","effect_E2",'effect_E3','contact'],
        color=alt.Color(
            "contact",
            scale=alt.Scale(
                domain=["contact", "close", "distant"],
                range=["#4e79a7", "#f28e2c", "gray"],
            ),
            legend=alt.Legend(title="Receptor distance"),
        ),
    ).properties(width=300,height=300)

    #Add r value to chart
    min_ = int(df_mean["effect_E2"].min())
    max_ = int(df_mean["effect_E3"].max())
    text = alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]}).mark_text(
        align="left",
        baseline="top",
        dx=-20,  # Adjust this for position
        dy=-20,  # Adjust this for position
    ).encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    
    plot = chart + text
    return plot


E2_E3_plot = mean_correlation_plot(distance_df, "mean")
E2_E3_plot.display()
if entry_region_boxplot_plot is not None:
    E2_E3_plot.save(E2_E3_entry_corr_plot)
site effect_E2 effect_E3 contact wildtype
0 71 -1.18 -0.62 distant Q
1 72 -1.23 -0.76 distant N
2 73 -0.74 -0.46 distant Y
3 74 -0.68 -0.44 distant T
4 75 -0.73 -0.29 distant R
... ... ... ... ... ...
527 598 -1.04 -0.82 distant P
528 599 0.01 0.12 distant E
529 600 0.07 0.18 distant Q
530 601 -1.06 -0.77 distant C
531 602 0.20 0.28 distant T

532 rows × 5 columns

In [11]:
def correlation_plot(df):
    df = df.dropna()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["effect_E2"], df["effect_E3"]
    )
    # setup interactive
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["contact"], value=1
    )
    #plotting
    chart = (
        alt.Chart(df)
        .mark_point(size=20,filled=True)
        .encode(
            x=alt.X("effect_E2", title="Entry in CHO-bEFNB2",axis=alt.Axis(tickCount=4)),
            y=alt.Y("effect_E3", title="Entry in CHO-bEFNB3",axis=alt.Axis(tickCount=4)),
            tooltip=["site", "wildtype", "mutant"],
            color=alt.Color(
                "contact",
                scale=alt.Scale(
                    domain=["contact", "close", "distant"],
                    range=["#4e79a7", "#f28e2c", "gray"],
                ),
                legend=alt.Legend(title="Receptor distance"),
            ),
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
        ).add_params(variant_selector)
    ).properties(height=300,width=300)
    min_ = int(df['effect_E2'].min())
    max_ = int(df['effect_E3'].max())
    
    text = (
        alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
        .mark_text(
            align="left",
            baseline="top",
            dx=-18,  # Adjust this for position
            dy=-40,  # Adjust this for position
        )
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    plot = chart + text

    return plot


tmp_img_corr = correlation_plot(distance_df)
tmp_img_corr.display()
if entry_region_boxplot_plot is not None:
    tmp_img_corr.save(E2_E3_entry_all_muts_plot)

Make boxplot showing median entry by RBP region¶

In [12]:
def make_boxplot_entry_region(df):
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
    empty_charts = []
    for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
        tmp_df = df[df["cell_type"] == selection]
        agg_means = []
        for barrel, sites in barrel_ranges.items():
            subset = tmp_df[tmp_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"region": barrel, "effect": row["effect"], "site": row["site"]}
                )
            agg_means_df = pd.DataFrame(agg_means)
        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_boxplot(color='darkgray',extent='min-max')
            .encode(
                x=alt.X(
                    "region:O",
                    sort=custom_order,
                    title="RBP Region",
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "effect",
                    title="Cell entry",
                    axis=alt.Axis(tickCount=4),
                ),
                tooltip=["region", "effect", "site"],
            ).properties(width=200,height=200)
        )
        empty_charts.append(chart)
    combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
        y="shared", x="shared", color="independent"
    )
    return combined_effect_chart


entry_region_boxplot = make_boxplot_entry_region(concat_df)
entry_region_boxplot.display()
if entry_region_boxplot_plot is not None:
    entry_region_boxplot.save(entry_region_boxplot_plot)

Check for potential neutral/beneficial glycosylation sites¶

In [13]:
def find_potential_glycan_sites(df):
    filtered = df[df["wildtype"].isin(["T", "S"])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[
            (df["site"] == row["site"] - 2) & (df["mutant"] == "N") & (df["effect"] > 0)
        ]
        if not prior_rows.empty:
            matching_sites.append(row["site"])
    unique_list1 = list(set(matching_sites))
    unique_list1 = [x - 2 for x in unique_list1]

    filtered = df[df["wildtype"].isin(["N"])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[
            (df["site"] == row["site"] + 2)
            & (df["mutant"].isin(["T", "S"]))
            & (df["effect"] > 0)
        ]
        if not prior_rows.empty:
            matching_sites.append(row["site"])
    unique_list2 = list(set(matching_sites))
    unique_list = unique_list1 + unique_list2
    unique_list.sort()
    return unique_list
#call function
PNLG = find_potential_glycan_sites(func_scores_E3)
#read in surface exposure data to find potential glycans on surface of protein
surface_df = pd.read_csv(surface)
surface_df.rename(columns={"exposure_A": "Surface Exposure"}, inplace=True)
PNLG_SURFACE = surface_df[surface_df["site"].isin(PNLG)]
PNLG_SURFACE = list(
    PNLG_SURFACE[PNLG_SURFACE["Surface Exposure"] >= 25]["site"].unique()
)

#print(f"\nThe surface exposed PNLG sites are: {PNLG_SURFACE}\n")

glycans = config["glycans"] #these are the glycan sites that are already present in protein

#filter out known glycan sites already present
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]
print(f'These are {len(filtered_PNLG_SURFACE)} potential glycan sites one amino acid away: {filtered_PNLG_SURFACE}')
#print(len(filtered_PNLG_SURFACE))
These are 15 potential glycan sites one amino acid away: [180, 191, 192, 311, 326, 359, 383, 403, 423, 473, 478, 543, 554, 570, 600]

Make bar plot of average entry scores for CHO-bEFNB3¶

In [14]:
def entry_by_site(df):
    tmp_df = df.groupby('site')['effect'].mean().reset_index()
    
    # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
    }
    
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
    
    agg_means = [] #store aggregation 
    
    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = tmp_df[tmp_df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, "effect": row["effect"], "site": row["site"]}
            )
        agg_means_df = pd.DataFrame(agg_means).round(2)
    agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet']) #add a column specifying which sites are in beta sheets
    
    ### The main chart plotting
    chart = (
        alt.Chart(agg_means_df)
        .mark_bar(opacity=1,color='black',binSpacing=0)
        .encode(
            x=alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            y=alt.Y("effect", title="Mean entry in CHO-bEFNB3"),
            tooltip=["site", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region',scale=alt.Scale(range=['#5778a4', '#e49444', '#d1615d', '#85b6b2'])),
        )
    ).properties(width=alt.Step(1.5),height=150)
    
    ### Draw rectanges showing where beta sheets are in protein above chart
    rect = alt.Chart(agg_means_df.query('beta_sheet == True')).mark_rect(color='black').encode(
        alt.X('site:N',axis=None),
        alt.Y('beta_sheet',axis=None)
    ).properties(width=800,height=10)
    
    combined_chart = alt.vconcat(chart,padding=0).resolve_scale(y='independent',x='shared')
    

    return combined_chart

entry_by_site_plot = entry_by_site(func_scores_E3)
entry_by_site_plot.display()

Make bar charts of mean cell entry by site for different regions, sorted by average and colored by the unmutated amino acid at position¶

In [15]:
def entry_by_site_region(df,site_list,name_of_title):
    #calculate mean by site
    tmp_df = df.groupby(['site','cell_type'])['effect'].mean().reset_index().round(2) 
    
    #make a unique values df to merge
    unique_wildtypes_df = df.drop_duplicates(subset=['site','wildtype','wt_type','wildtype_site'])
    
    #merge mean and unique values
    tmp_df = pd.merge(tmp_df, unique_wildtypes_df[['site', 'wt_type','wildtype','wildtype_site']], on='site', how='left')

    #filter by site
    tmp_df = tmp_df[tmp_df['site'].isin(site_list)]

    #make chart
    chart = (
        alt.Chart(tmp_df,title=name_of_title)
        .mark_bar(opacity=1,color='black')
        .encode(
            y=alt.Y("wildtype_site:N", sort='-x',title='Site'),
            x=alt.X("effect", title="Mean entry of mutants"),
            tooltip=["site", "effect", 'wildtype', 'wt_type'],
            #color=alt.Color('wt_type:N', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(title="Unmutated amino-acid type"))
        )
    ).properties(width=100, height=alt.Step(14))
    
    return chart

Ranked average entry in contact sites¶

In [16]:
entry_by_site_receptor_plot = entry_by_site_region(func_scores_E3,config['contact_sites'],'Contact Sites')
entry_by_site_receptor_plot.display()

Now make ranked averages for each region¶

Ranked average entry in RBP stalk¶

In [17]:
entry_by_site_receptor_plot_stalk = entry_by_site_region(func_scores_E3,list(range(70, 148)),'Stalk')
entry_by_site_receptor_plot_stalk.display()

Ranked average entry in RBP neck¶

In [18]:
entry_by_site_receptor_plot_neck = entry_by_site_region(func_scores_E3,list(range(148, 166)),'Neck')
entry_by_site_receptor_plot_neck.display()

Ranked average entry in RBP linker¶

In [19]:
entry_by_site_receptor_plot_linker = entry_by_site_region(func_scores_E3,list(range(166, 178)),'Linker')
entry_by_site_receptor_plot_linker.display()
In [20]:
combined_region_barplot = alt.hconcat(entry_by_site_receptor_plot_neck,entry_by_site_receptor_plot,padding=0).resolve_scale(y="independent", x="shared", color="shared")
combined_region_barplot.display()
combined_region_barplot.save(combined_region_barplot_output)
In [ ]:
 
In [ ]: