# this cell is tagged as parameters for `papermill` parameterization
### input
nipah_config = None

HENV103_file = None
HENV117_file = None
HENV26_file = None
HENV32_file = None
m102_file = None
nAH1_file = None

e2_entry_file = None
e3_entry_file = None

e2_binding_file = None
e3_binding_file = None

### output
HENV103_filtered_path = None
HENV117_filtered_path = None
HENV26_filtered_path = None 
HENV32_filtered_path = None 
m102_filtered_path = None
nAH1_filtered_path = None

e2_entry_filtered = None
e3_entry_filtered = None
entry_filter_concat = None
RBP_mutation_effects_cell_entry_E2 = None
RBP_mutation_effects_cell_entry_E3 = None

e2_binding_filtered = None
e3_binding_filtered = None
e2_low_binding_effect_filter = None
e3_low_binding_effect_filter = None
RBP_mutation_effects_bEFNB2_binding = None
RBP_mutation_effects_bEFNB3_binding = None

mab_filter_concat_file = None
RBP_mutation_effects_antibody_escape = None
e3_low_mab_effect_filter = None

# Parameters
nipah_config = "nipah_config.yaml"
HENV103_file = "results/antibody_escape/averages/HENV103_mut_effect.csv"
HENV117_file = "results/antibody_escape/averages/HENV117_mut_effect.csv"
HENV26_file = "results/antibody_escape/averages/HENV26_mut_effect.csv"
HENV32_file = "results/antibody_escape/averages/HENV32_mut_effect.csv"
m102_file = "results/antibody_escape/averages/m102.4_mut_effect.csv"
nAH1_file = "results/antibody_escape/averages/nAH1.3_mut_effect.csv"
e2_entry_file = "results/func_effects/averages/CHO_bEFNB2_func_effects.csv"
e3_entry_file = "results/func_effects/averages/CHO_bEFNB3_func_effects.csv"
e2_binding_file = "results/receptor_affinity/averages/bEFNB2_monomeric_mut_effect.csv"
e3_binding_file = "results/receptor_affinity/averages/bEFNB3_dimeric_mut_effect.csv"
e2_entry_filtered = "results/filtered_data/entry/e2_entry_filtered.csv"
e3_entry_filtered = "results/filtered_data/entry/e3_entry_filtered.csv"
entry_filter_merged = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
entry_filter_concat = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
RBP_mutation_effects_cell_entry_E2 = "results/filtered_data/public_filtered/RBP_mutation_effects_cell_entry_CHO-bEFNB2.csv"
RBP_mutation_effects_cell_entry_E3 = "results/filtered_data/public_filtered/RBP_mutation_effects_cell_entry_CHO-bEFNB3.csv"
e2_binding_filtered = "results/filtered_data/binding/e2_binding_filtered.csv"
e3_binding_filtered = "results/filtered_data/binding/e3_binding_filtered.csv"
RBP_mutation_effects_bEFNB2_binding = (
    "results/filtered_data/public_filtered/RBP_mutation_effects_bEFNB2_binding.csv"
)
RBP_mutation_effects_bEFNB3_binding = (
    "results/filtered_data/public_filtered/RBP_mutation_effects_bEFNB3_binding.csv"
)
e2_low_binding_effect_filter = (
    "results/filtered_data/binding/e2_low_binding_effect_filter.csv"
)
e3_low_binding_effect_filter = (
    "results/filtered_data/binding/e3_low_binding_effect_filter.csv"
)
HENV103_filtered_path = "results/filtered_data/escape/HENV103_escape_filtered.csv"
HENV117_filtered_path = "results/filtered_data/escape/HENV117_escape_filtered.csv"
HENV26_filtered_path = "results/filtered_data/escape/HENV26_escape_filtered.csv"
HENV32_filtered_path = "results/filtered_data/escape/HENV32_escape_filtered.csv"
m102_filtered_path = "results/filtered_data/escape/m102_escape_filtered.csv"
nAH1_filtered_path = "results/filtered_data/escape/nAH1_escape_filtered.csv"
e3_low_mab_effect_filter = "results/filtered_data/escape/e3_low_mab_effect_filter.csv"
mab_filter_concat_file = "results/filtered_data/escape/mab_filter_concat.csv"
RBP_mutation_effects_antibody_escape = (
    "results/filtered_data/public_filtered/RBP_mutation_effects_antibody_escape.csv"
)

import math
import os
import re
import altair as alt
import numpy as np

import pandas as pd

import scipy.stats

import Bio.SeqIO
import yaml

from Bio import PDB
import sys


# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')

Setup in correct directory

ThemeRegistry.enable('main_theme')

if nipah_config is None:
    #input
    nipah_config = 'nipah_config.yaml'

    HENV103_file = 'results/antibody_escape/averages/HENV103_mut_effect.csv'
    HENV117_file = 'results/antibody_escape/averages/HENV117_mut_effect.csv'
    HENV26_file = 'results/antibody_escape/averages/HENV26_mut_effect.csv'
    HENV32_file = 'results/antibody_escape/averages/HENV32_mut_effect.csv'
    m102_file = 'results/antibody_escape/averages/m102.4_mut_effect.csv'
    nAH1_file = 'results/antibody_escape/averages/nAH1.3_mut_effect.csv'
    
    e2_entry_file = 'results/func_effects/averages/CHO_bEFNB2_func_effects.csv'
    e3_entry_file = 'results/func_effects/averages/CHO_bEFNB3_func_effects.csv'

    e2_binding_file = 'results/receptor_affinity/averages/bEFNB2_monomeric_mut_effect.csv'
    e3_binding_file = 'results/receptor_affinity/averages/bEFNB3_dimeric_mut_effect.csv'
    
    #output
    e2_entry_filtered = 'results/filtered_data/entry/e2_entry_filtered.csv'
    e3_entry_filtered = 'results/filtered_data/entry/e3_entry_filtered.csv'
    entry_filter_merged = 'results/filtered_data/entry/e2_e3_entry_filter_merged.csv'
    entry_filter_concat = 'results/filtered_data/entry/e2_e3_entry_filter_concat.csv'
    RBP_mutation_effects_cell_entry_E2 = 'results/filtered_data/public_filtered/RBP_mutation_effects_cell_entry_CHO-bEFNB2.csv'
    RBP_mutation_effects_cell_entry_E3 = 'results/filtered_data/public_filtered/RBP_mutation_effects_cell_entry_CHO-bEFNB3.csv'


    e2_binding_filtered = 'results/filtered_data/binding/e2_binding_filtered.csv'
    e3_binding_filtered = 'results/filtered_data/binding/e3_binding_filtered.csv'
    e2_low_binding_effect_filter = 'results/filtered_data/binding/e2_low_binding_effect_filter.csv'
    e3_low_binding_effect_filter = 'results/filtered_data/binding/e3_low_binding_effect_filter.csv'
    RBP_mutation_effects_bEFNB2_binding = 'results/filtered_data/public_filtered/RBP_mutation_effects_bEFNB2_binding.csv'
    RBP_mutation_effects_bEFNB3_binding = 'results/filtered_data/public_filtered/RBP_mutation_effects_bEFNB3_binding.csv'
    
    HENV103_filtered_path = 'results/filtered_data/escape/HENV103_escape_filtered.csv'
    HENV117_filtered_path = 'results/filtered_data/escape/HENV117_escape_filtered.csv'
    HENV26_filtered_path = 'results/filtered_data/escape/HENV26_escape_filtered.csv'
    HENV32_filtered_path = 'results/filtered_data/escape/HENV32_escape_filtered.csv'
    m102_filtered_path = 'results/filtered_data/escape/m102_escape_filtered.csv'
    nAH1_filtered_path = 'results/filtered_data/escape/nAH1_escape_filtered.csv'
    mab_filter_concat_file = 'results/filtered_data/escape/mab_filter_concat.csv'
    e3_low_mab_effect_filter = 'results/filtered_data/escape/e3_low_mab_effect_filter.csv'
    RBP_mutation_effects_antibody_escape = 'results/filtered_data/public_filtered/RBP_mutation_effects_antibody_escape.csv'

with open(nipah_config) as f:
    config = yaml.safe_load(f)

# read in entry files
e2_entry = pd.read_csv(e2_entry_file)
e3_entry = pd.read_csv(e3_entry_file)

# read in binding files
e2_binding = pd.read_csv(e2_binding_file)
e3_binding = pd.read_csv(e3_binding_file)

# read in antibody escape files
HENV103 = pd.read_csv(HENV103_file)
HENV117 = pd.read_csv(HENV117_file)
HENV26 = pd.read_csv(HENV26_file)
HENV32 = pd.read_csv(HENV32_file)
m102 = pd.read_csv(m102_file)
nAH1 = pd.read_csv(nAH1_file)

aa_property_mapping = {
        "A": "Hydrophobic", "V": "Hydrophobic", "L": "Hydrophobic", "I": "Hydrophobic", "M": "Hydrophobic",
        "Y": "Aromatic", "W": "Aromatic", "F": "Aromatic",
        "K": "Positive", "R": "Positive", "H": "Positive",
        "E": "Negative", "D": "Negative",
        "S": "Hydrophilic", "T": "Hydrophilic", "N": "Hydrophilic", "Q": "Hydrophilic",
        "G": "Special", "C": "Special", "P": "Special",
}

codon_table = {
    "ATA": "I",
    "ATC": "I",
    "ATT": "I",
    "ATG": "M",
    "ACA": "T",
    "ACC": "T",
    "ACG": "T",
    "ACT": "T",
    "AAC": "N",
    "AAT": "N",
    "AAA": "K",
    "AAG": "K",
    "AGC": "S",
    "AGT": "S",
    "AGA": "R",
    "AGG": "R",
    "CTA": "L",
    "CTC": "L",
    "CTG": "L",
    "CTT": "L",
    "CCA": "P",
    "CCC": "P",
    "CCG": "P",
    "CCT": "P",
    "CAC": "H",
    "CAT": "H",
    "CAA": "Q",
    "CAG": "Q",
    "CGA": "R",
    "CGC": "R",
    "CGG": "R",
    "CGT": "R",
    "GTA": "V",
    "GTC": "V",
    "GTG": "V",
    "GTT": "V",
    "GCA": "A",
    "GCC": "A",
    "GCG": "A",
    "GCT": "A",
    "GAC": "D",
    "GAT": "D",
    "GAA": "E",
    "GAG": "E",
    "GGA": "G",
    "GGC": "G",
    "GGG": "G",
    "GGT": "G",
    "TCA": "S",
    "TCC": "S",
    "TCG": "S",
    "TCT": "S",
    "TTC": "F",
    "TTT": "F",
    "TTA": "L",
    "TTG": "L",
    "TAC": "Y",
    "TAT": "Y",
    "TAA": "*",
    "TAG": "*",
    "TGC": "C",
    "TGT": "C",
    "TGA": "*",
    "TGG": "W",
}

def filter_entry_df(df,name):
    #first, remove wildtype observations, we don't want to include them in counts or filtered data
    df = df.dropna()
    
    #function to filter cell entry dataframes
    print(f'We are analyzing data for {name}:')
    
    #calculate how many selections were done in total
    max_sels = df["n_selections"].max()
    num_sels_cutoff = (max_sels * config['percent_selections'])
    print(f"The number of selections a mutant must be observed in is: {num_sels_cutoff}")

    #first filter mutants we don't care about and don't want to report
    filtered_df = df[
        (~df['mutant'].isin(['-','*']))
        & (df['site'] != 603)
    ]
    print(f'There are {filtered_df.shape[0]} mutations present before filtering')

    # Now do the actual filtering we care about. Do one step at time so we can report filtering stats

    # filter by times seen first
    times_seen_filter_df = filtered_df[filtered_df["times_seen"] >= config["func_times_seen_cutoff"]]
    print(f'There are {filtered_df.shape[0] - times_seen_filter_df.shape[0]} mutations removed for low times seen')

    # now filter by std
    effect_std_filter_df = times_seen_filter_df[times_seen_filter_df['effect_std'] <= config['func_std_cutoff']]
    print(f'There are {times_seen_filter_df.shape[0] - effect_std_filter_df.shape[0]} mutations removed for high std variance')

    # now filter by whether mutations are present in enough selections
    num_selections_filter_df = effect_std_filter_df[effect_std_filter_df['n_selections'] >= num_sels_cutoff]
    print(f'There are {effect_std_filter_df.shape[0] - num_selections_filter_df.shape[0]} mutations removed for not being present in enough selections')

    filter_df = num_selections_filter_df.copy()
    # make a useful combination of string column for plotting later
    filter_df['wildtype_site'] = filter_df['wildtype'].astype(str) + filter_df['site'].astype(str)
    # add residue type information to df
    filter_df['wt_type'] = filter_df['wildtype'].map(aa_property_mapping)
    filter_df['mutant_type'] = filter_df['mutant'].map(aa_property_mapping)

    print(f'The number of mutations present after all the filtering is {filter_df.shape[0]}, which is {round(filter_df.shape[0] / (532*19), 2)} of the theoretical total\n')
    return filter_df

#call function above. e2_filter_df and e3_filter_df have been filtered
#before calling function, assign a new column to keep track of what selection it was
e2_entry["cell_type"] = "CHO-bEFNB2"
e3_entry["cell_type"] = "CHO-bEFNB3"

e2_filter_df = filter_entry_df(e2_entry,'CHO-bEFNB2')
e3_filter_df = filter_entry_df(e3_entry,'CHO-bEFNB3')

We are analyzing data for CHO-bEFNB2:
The number of selections a mutant must be observed in is: 4.0
There are 10048 mutations present before filtering
There are 217 mutations removed for low times seen
There are 44 mutations removed for high std variance
There are 1 mutations removed for not being present in enough selections
The number of mutations present after all the filtering is 9786, which is 0.97 of the theoretical total

We are analyzing data for CHO-bEFNB3:
The number of selections a mutant must be observed in is: 3.5
There are 10074 mutations present before filtering
There are 183 mutations removed for low times seen
There are 115 mutations removed for high std variance
There are 63 mutations removed for not being present in enough selections
The number of mutations present after all the filtering is 9713, which is 0.96 of the theoretical total

# make accessible .csv file to share that contains very specific information
public_e2_entry = e2_filter_df.drop(columns={'effect_std','times_seen','n_selections','cell_type','wt_type','mutant_type','wildtype_site'})
public_e2_entry = public_e2_entry.rename(columns={'effect':'entry_CHO_bEFNB2'})
public_e2_entry.to_csv(RBP_mutation_effects_cell_entry_E2,index=False)

public_e3_entry = e3_filter_df.drop(columns={'effect_std','times_seen','n_selections','cell_type','wt_type','mutant_type','wildtype_site'})
public_e3_entry = public_e3_entry.rename(columns={'effect':'entry_CHO_bEFNB3'})
public_e3_entry.to_csv(RBP_mutation_effects_cell_entry_E3,index=False)

#save filtered files
e2_filter_df.to_csv(e2_entry_filtered,index=False)
e3_filter_df.to_csv(e3_entry_filtered,index=False)

#make concat version
concat_df = pd.concat([e2_filter_df, e3_filter_df])
concat_df.to_csv(entry_filter_concat,index=False)



#make merged version
merged_df = pd.merge(
    e2_filter_df,
    e3_filter_df,
    on=["site", "mutant", "wildtype"],
    how="outer",
    suffixes=["_E2", "_E3"],
)
merged_df.to_csv(entry_filter_merged,index=False)

def filter_mab_escape(df,name,filtered_output):
    #filter antibody escape by CHO-bEFNB3 functional scores
    tmp_df = df.merge(e3_filter_df, on=['site','wildtype','mutant'],how='left',suffixes=['_ab','_func_effects'])
    
    #filter antibody escape data
    tmp_df = tmp_df[
        (~tmp_df['mutant'].isin(['-','*']))
        & (tmp_df['site'] != 603) 
        & (tmp_df['times_seen_ab'] >= config['min_times_seen_ab']) 
        & (tmp_df['frac_models'] >= config['frac_models']) 
        & (tmp_df['effect'] >= config['min_func_effect_for_ab'])  
    ]
    tmp_df['ab'] = name

    # Now identify top escape sites of either a percentage or the max or sum escape score at a site
    #find max escape
    print(name)
    max_escape = tmp_df['escape_mean'].max()
    max_filter_df = tmp_df[tmp_df['escape_mean'] > max_escape * 0.5]
    max_site_list = max_filter_df['site'].unique().tolist()
    print(f' The sites with the top max escape are {max_site_list}')
    
    #find top summed escape sites
    sum_df = tmp_df.groupby('site')['escape_mean'].sum().reset_index()
    max_sum = sum_df['escape_mean'].max()
    sum_filter_df = sum_df[sum_df['escape_mean'] > max_sum * 0.75]
    sum_site_list = sum_filter_df['site'].unique().tolist()
    print(f' The sites with the top summed escape are {sum_site_list}')
    
    #combine max and summed into same list and add top escape site to column
    combined_sites_list = list(set(max_site_list + sum_site_list))
    tmp_df['top_escape'] = tmp_df['site'].isin(combined_sites_list)
    
    #add column with just wildtype and site information
    tmp_df['wildtype_site'] = tmp_df['wildtype'].astype(str) + tmp_df['site'].astype(str)
    
    # add residue type information to df
    tmp_df['wt_type'] = tmp_df['wildtype'].map(aa_property_mapping)
    tmp_df['mutant_type'] = tmp_df['mutant'].map(aa_property_mapping)
    
    # Save filtered data
    tmp_df.to_csv(filtered_output, index=False)
    
    return tmp_df

# call function above and filter each antibody separately
HENV103_df = filter_mab_escape(HENV103,'HENV-103', HENV103_filtered_path)
HENV117_df = filter_mab_escape(HENV117,'HENV-117', HENV117_filtered_path)
HENV26_df = filter_mab_escape(HENV26,'HENV-26', HENV26_filtered_path)
HENV32_df = filter_mab_escape(HENV32,'HENV-32', HENV32_filtered_path)
m102_df = filter_mab_escape(m102,'m102.4', m102_filtered_path)
nAH1_df = filter_mab_escape(nAH1,'nAH1.3', nAH1_filtered_path)

# now make a concatanated escape dataframe containing all antibodies
mab_filter_concat = pd.concat([HENV103_df,HENV117_df,HENV26_df,HENV32_df,m102_df,nAH1_df])
# only include some columns in the final filtered data
mab_filter_concat = mab_filter_concat[['site','wildtype','mutant','mutation','escape_mean','escape_median','escape_std','times_seen_ab','frac_models','effect','effect_std','times_seen_func_effects','ab','top_escape','wildtype_site','wt_type','mutant_type']]


### Now want to assign columns with how many mutations a mutation away is from 'wt' nipah sequence

# Load in wt nucleotide sequence (which is different than the 'wt' sequence from unmutated library as it was codon optimized)
niv_m_wt = str(
    Bio.SeqIO.read(
        "data/custom_analyses_data/alignments/wild_type_seq.fasta", "fasta"
    ).seq
)

# Function to find closest codon
def find_closest_codon(wt_codon, mutant_aa):
    mutant_codons = [codon for codon, aa in codon_table.items() if aa == mutant_aa]
    min_mutations = 3  # Maximum mutations possible
    closest_codon = None
    for m_codon in mutant_codons:
        mutations = sum([1 for c1, c2 in zip(wt_codon, m_codon) if c1 != c2])
        if mutations < min_mutations:
            min_mutations = mutations
            closest_codon = m_codon
    return closest_codon, min_mutations

# Function to extract codon for a given site
def extract_codon(site):
    idx = (site - 1) * 3
    return niv_m_wt[idx : idx + 3]

# Function to apply minimum mutation information to concat dataframe
def apply_codon_to_df(df, extract_func):
    df["wt_codon"] = df["site"].apply(extract_func)
    df["closest_mutant_codon"] = df.apply(
        lambda row: find_closest_codon(row["wt_codon"], row["mutant"])[0], axis=1
    )
    df["min_mutations"] = df.apply(
        lambda row: find_closest_codon(row["wt_codon"], row["mutant"])[1], axis=1
    )
    return df

# call functions above
combined_df = apply_codon_to_df(mab_filter_concat, extract_codon)
# save to file
combined_df.to_csv(mab_filter_concat_file,index=False) #save this final concatanated data that contains number of mutations away each escape mutant is from the wildtype Nipah malaysia sequence

HENV-103
 The sites with the top max escape are [258, 259, 260, 264]
 The sites with the top summed escape are [205, 260]
HENV-117
 The sites with the top max escape are [258, 555, 580, 583, 586, 587]
 The sites with the top summed escape are [171, 580, 582, 586]
HENV-26
 The sites with the top max escape are [491, 494, 501, 530]
 The sites with the top summed escape are [491, 501, 530]
HENV-32
 The sites with the top max escape are [199, 200, 205]
 The sites with the top summed escape are [199]
m102.4
 The sites with the top max escape are [239, 507, 559, 577, 582, 586, 589]
 The sites with the top summed escape are [239, 582]
nAH1.3
 The sites with the top max escape are [184, 185, 186, 187, 188, 189, 190, 447, 448, 449, 450, 451, 452, 468, 513, 515, 516, 517, 518, 519, 520, 597]
 The sites with the top summed escape are [185, 450, 468, 517]

# make accessible .csv file to share that contains very specific information
public_escape = combined_df.drop(columns={'mutation','escape_median','escape_std','times_seen_ab','frac_models','effect','effect_std','times_seen_func_effects','top_escape','wildtype_site','wt_type','mutant_type','wt_codon','closest_mutant_codon','min_mutations'})
public_escape = public_escape.rename(columns={'ab':'antibody'})
public_escape.to_csv(RBP_mutation_effects_antibody_escape,index=False)

### Antibody low effect
#e3 entry
e3_mab_low_effect = e3_filter_df[
    (e3_filter_df["effect"] < config["min_func_effect_for_ab"])
]

e3_mab_low_effect.to_csv(e3_low_mab_effect_filter,index=False)
print(f'For antibodies, {e3_mab_low_effect.shape[0]} mutations will be masked in heatmaps due to low functional effects')

For antibodies, 2670 mutations will be masked in heatmaps due to low functional effects

def merge_func_binding_dfs(func, binding, name):
    # Merge the 'binding' and 'func' DataFrames on specified columns with outer join.
    # This combines data based on 'site', 'mutant', and 'wildtype' columns.
    # Suffixes are added to columns with identical names in both DataFrames for differentiation.

    #first, filter mutants we don't care about
    binding = binding[
        (~binding['mutant'].isin(["*", "-"]))
        & (binding['site'] != 603)
    ]
    print(f'There are a total of {binding.shape[0]} mutants for {name}')

    # Now merge binding with cell entry dataframe (we need to filter out mutants with bad entry)
    df_int = pd.merge(
        binding,
        func,
        on=["site", "mutant", "wildtype"],
        suffixes=["_binding", "_cell_entry"],
        validate="one_to_one",
        how="left",
    )
    # Rename specific columns for clarity.
    df = df_int.rename(
        columns={
            "Ephrin binding_mean": "binding_mean",
            "Ephrin binding_std": "binding_std",
            "Ephrin binding_median": "binding_median",
        }
    )
    #assign amino acid type to mutants
    df['mutant_type'] = df['mutant'].map(aa_property_mapping)

    # Select and reorder a subset of columns for the final DataFrame.
    df = df[
        [
            "site",
            "wildtype",
            "mutant",
            "binding_mean",
            "binding_std",
            "times_seen_binding",
            "effect",
            "effect_std",
            "times_seen_cell_entry",
            "frac_models",
            'mutant_type'
        ]
    ]
    # Define a function to filter the DataFrame based on several criteria defined in Nipah_config.yaml.
    def filter_binding_data(df):
        # Apply multiple conditions to filter the data for relevant entries.
        # First, filter by binding parameters
        df_filter_first = df[
            (df["times_seen_binding"] >= config["min_times_seen_binding"])
            & (df["binding_std"] <= config["max_binding_std"])
            & (df["frac_models"] >= config["frac_models"])
        ]
        # how many mutants did this filtering remove?
        print(f'Filtering sites by binding times seen,frac_models, and std, removes {binding.shape[0] - df_filter_first.shape[0]} mutants for {name}')
        
        # Now, filter out sites with bad cell entry
        df_filter_second = df_filter_first[
            (df_filter_first["effect"] >= config["min_func_effect_for_binding"])
        ]
        print(f'Filtering by low cell entry, there are {df_filter_first.shape[0] - df_filter_second.shape[0]} mutants removed for {name}')
        sites = df_filter_second['site'].unique()
        print(f'And the total number of sites is {len(sites)}')

        # now make dataframes for bad cell entry to be used in heatmaps in other notebooks
        df_filter_bad = df_filter_first[df_filter_first['effect'] < config['min_func_effect_for_binding']]
    

        return df_filter_second, df_filter_bad
    
    
    # filter the DataFrame using the defined function.
    df_filter, df_bad_entry = filter_binding_data(df)
    print(df_filter.shape[0])
    # Save the filtered data to CSV files
    if name == "EFNB2":
        df_filter.to_csv(e2_binding_filtered, index=False)
        df_bad_entry.to_csv(e2_low_binding_effect_filter,index=False)
    else:
        df_filter.to_csv(e3_binding_filtered, index=False)
        df_bad_entry.to_csv(e3_low_binding_effect_filter,index=False)
    return df_filter, df_bad_entry

# Use the defined function to filter data for EFNB2 and EFNB3.
e2_filter, e2_bad = merge_func_binding_dfs(e2_filter_df, e2_binding, "EFNB2")
e3_filter, e3_bad = merge_func_binding_dfs(e3_filter_df, e3_binding, "EFNB3")

There are a total of 10043 mutants for EFNB2
Filtering sites by binding times seen,frac_models, and std, removes 1722 mutants for EFNB2
Filtering by low cell entry, there are 1649 mutants removed for EFNB2
And the total number of sites is 508
6672
There are a total of 9932 mutants for EFNB3
Filtering sites by binding times seen,frac_models, and std, removes 1929 mutants for EFNB3
Filtering by low cell entry, there are 1500 mutants removed for EFNB3
And the total number of sites is 503
6503

e2_public_binding = e2_filter.drop(columns={'binding_std','times_seen_binding','effect','effect_std','times_seen_cell_entry','frac_models','mutant_type'})
e2_public_binding = e2_public_binding.rename(columns={'binding_mean':'bEFNB2_binding'})
e2_public_binding.to_csv(RBP_mutation_effects_bEFNB2_binding,index=False)

e3_public_binding = e3_filter.drop(columns={'binding_std','times_seen_binding','effect','effect_std','times_seen_cell_entry','frac_models','mutant_type'})
e3_public_binding = e3_public_binding.rename(columns={'binding_mean':'bEFNB3_binding'})
e3_public_binding.to_csv(RBP_mutation_effects_bEFNB3_binding,index=False)

def prepare_df(df_binding,df_entry):
    #fix names
    df_tmp = df_binding.rename(
        columns={
            "Ephrin binding_mean": "binding_mean",
            "Ephrin binding_std": "binding_std",
            "Ephrin binding_median": "binding_median",
        }
    )

    #filter out stops and site 603
    df_tmp = df_tmp[
        (~df_tmp['mutant'].isin(["*", "-"]))
        & (df_tmp['site'] != 603)
    ]
    #filter by times seen and standard deviation between measurements
    df_tmp = df_tmp[
            (df_tmp["times_seen"] >= config["min_times_seen_binding"])
            & (df_tmp["binding_std"] <= config["max_binding_std"])
            #& (df_tmp["effect"] >= config["min_func_effect_for_binding"]) 
    ]

    #now merge with functional effect
    binding_measured = pd.merge(df_entry,df_tmp[['site','wildtype','mutant','binding_mean']],on=['site','wildtype','mutant'],how='left')
    binding_measured['binding_measurement'] = binding_measured['binding_mean'].notna()
    #display(binding_measured.query('binding_measurement == True'))
    return binding_measured

e2_merged = prepare_df(e2_binding,e2_filter_df)
e3_merged = prepare_df(e3_binding,e3_filter_df)

def make_binding_effect_boxplot(df,title):
    df['binding_measurement'] = df['binding_measurement'].astype(str)
    
    chart = (
        alt.Chart(df,title=f'{title}')
        .mark_boxplot(color='darkgray',extent='min-max',size=25)
        .encode(
            x=alt.X(
                "binding_measurement:O",
                title='Binding Measured',
                sort=['True','False'],
                axis=alt.Axis(labelAngle=-90),
            ),
            y=alt.Y(
                "effect",
                title="Cell entry",
                axis=alt.Axis(tickCount=4),
            ),
        ).properties(width=alt.Step(50),height=200)
    )
    return chart

e2_binding_plot = make_binding_effect_boxplot(e2_merged,'bEFNB2')
e3_binding_plot = make_binding_effect_boxplot(e3_merged,'bEFNB3')
combined_plot = alt.hconcat(e2_binding_plot,e3_binding_plot).resolve_scale(y='shared')
combined_plot.display()

#Pull out specific mutations
def specific_muts(df):
    tmp_df = df[
        (df['mutant'] == 'G')
        & (df['site'] == 589)
    ]
    display(tmp_df)

specific_muts(e2_binding)
specific_muts(e3_binding)

filter_data.ipynb¶

This script will filter entry, binding, and escape DMS data for analysis based on parameters set in nipah_config.yaml¶

These are the parameters that are used for filtering¶

Functional selection filtering parameters¶

Binding filtering parameters¶

Antibody escape filtering parameters¶

Papermill parameters¶

Import packages, set working directory, import altair theme¶

Hard paths if running notebook interactively instead of with snakemake/papermill¶

Load config file that contains all the filtering parameters¶

Import all raw data for entry, binding, and escape¶

Setup two different dictionaries to apply to filtered data frames¶

Filter cell entry and write files¶

Now that we have filtered the cell entry data, lets save it to use in other notebooks, and also make some merged and concatanated versions of these dataframes¶

Filter escape scores¶

Make filtered dataframes for low effect mutants that will be shown as filtered out in heatmaps¶

Filter binding data¶

Make plot showing distribution of effects on cell entry for different types of binding mutants¶