In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# Every variable is a file path.  papermill injects concrete values in the
# next cell; when the notebook is run by hand, the "interactively" cell
# further down fills in defaults for any that are still None.
# --- cell entry (functional) and receptor binding data, E2/E3 libraries ---
E2_func_infile = None
E2_binding_infile = None
E2_func_output = None
E2_binding_output = None
E3_func_infile = None
E3_binding_infile = None
E3_func_output = None
E3_binding_output = None
# --- antibody escape data, one infile/output pair per antibody ---
HENV26_infile = None
HENV26_output = None
HENV32_infile = None
HENV32_output = None
HENV103_infile = None
HENV103_output = None
HENV117_infile = None
HENV117_output = None
m1024_infile = None
m1024_output = None
nAH13_infile = None
nAH13_output = None
In [2]:
# Parameters
# (this cell is auto-injected by papermill at run time; do not edit by hand)
E2_func_infile = "results/filtered_data/entry/e2_entry_filtered.csv"
E2_binding_infile = "results/filtered_data/binding/e2_binding_filtered.csv"
E2_func_output = "results/parsed_mapping_data/E2_entry_mean.csv"
E2_binding_output = "results/parsed_mapping_data/E2_binding_mean.csv"
E3_func_infile = "results/filtered_data/entry/e3_entry_filtered.csv"
E3_binding_infile = "results/filtered_data/binding/e3_binding_filtered.csv"
E3_func_output = "results/parsed_mapping_data/E3_entry_mean.csv"
E3_binding_output = "results/parsed_mapping_data/E3_binding_mean.csv"
HENV26_infile = "results/filtered_data/escape/HENV26_escape_filtered.csv"
HENV32_infile = "results/filtered_data/escape/HENV32_escape_filtered.csv"
HENV103_infile = "results/filtered_data/escape/HENV103_escape_filtered.csv"
HENV117_infile = "results/filtered_data/escape/HENV117_escape_filtered.csv"
# NOTE(review): the m1024_*/nAH13_* variables point at files named
# m102_*/nAH1_* — presumably alternate spellings of the antibody names;
# confirm against the upstream pipeline config.
m1024_infile = "results/filtered_data/escape/m102_escape_filtered.csv"
nAH13_infile = "results/filtered_data/escape/nAH1_escape_filtered.csv"
HENV26_output = "results/parsed_mapping_data/HENV26_escape_mean.csv"
HENV32_output = "results/parsed_mapping_data/HENV32_escape_mean.csv"
HENV103_output = "results/parsed_mapping_data/HENV103_escape_mean.csv"
HENV117_output = "results/parsed_mapping_data/HENV117_escape_mean.csv"
m1024_output = "results/parsed_mapping_data/m102_escape_mean.csv"
nAH13_output = "results/parsed_mapping_data/nAH1_escape_mean.csv"
Import packages¶
In [3]:
import math
import os
import shutil
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
import yaml
import csv
import subprocess
Set working directory¶
In [4]:
# Ensure the notebook runs from the project root so relative paths resolve.
# NOTE(review): os.getcwd() normally returns a path without a trailing
# slash, so this comparison is likely always False and we always chdir;
# chdir to the current directory is harmless, so behavior is unchanged.
project_dir = '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/'
if os.getcwd() == project_dir:
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")
Setup in correct directory
For running notebook interactively¶
In [5]:
# Defaults for running the notebook interactively.  When executed through
# papermill the injected "Parameters" cell has already set E2_func_infile,
# so this whole branch is skipped.
if E2_func_infile is None:
    E2_func_infile = 'results/filtered_data/entry/e2_entry_filtered.csv'
    E2_func_output = 'results/parsed_mapping_data/E2_entry_mean.csv'
    E2_binding_infile = 'results/filtered_data/binding/e2_binding_filtered.csv'
    E2_binding_output = 'results/parsed_mapping_data/E2_binding_mean.csv'
    E3_func_infile = 'results/filtered_data/entry/e3_entry_filtered.csv'
    E3_func_output = 'results/parsed_mapping_data/E3_entry_mean.csv'
    E3_binding_infile = 'results/filtered_data/binding/e3_binding_filtered.csv'
    E3_binding_output = 'results/parsed_mapping_data/E3_binding_mean.csv'
    HENV26_infile = 'results/filtered_data/escape/HENV26_escape_filtered.csv'
    # Fixed: HENV26_output and HENV32_output previously pointed at
    # *_escape_parsed.csv, inconsistent with the papermill parameters and
    # with every other antibody (*_escape_mean.csv).
    HENV26_output = 'results/parsed_mapping_data/HENV26_escape_mean.csv'
    HENV32_infile = 'results/filtered_data/escape/HENV32_escape_filtered.csv'
    HENV32_output = 'results/parsed_mapping_data/HENV32_escape_mean.csv'
    HENV103_infile = 'results/filtered_data/escape/HENV103_escape_filtered.csv'
    HENV103_output = 'results/parsed_mapping_data/HENV103_escape_mean.csv'
    HENV117_infile = 'results/filtered_data/escape/HENV117_escape_filtered.csv'
    HENV117_output = 'results/parsed_mapping_data/HENV117_escape_mean.csv'
    m1024_infile = 'results/filtered_data/escape/m102_escape_filtered.csv'
    m1024_output = 'results/parsed_mapping_data/m102_escape_mean.csv'
    nAH13_infile = 'results/filtered_data/escape/nAH1_escape_filtered.csv'
    nAH13_output = 'results/parsed_mapping_data/nAH1_escape_mean.csv'
Aggregate cell entry¶
In [6]:
def aggregate_entry_mean(infile, name, outfile):
    """Average per-mutation cell-entry effects to one value per site and
    write a Chimera/ChimeraX residue-attribute ("defattr") file.

    Parameters
    ----------
    infile : str
        Path to a CSV with at least 'site' and 'effect' columns
        (one row per mutation).
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    """
    df = pd.read_csv(infile)
    # Mean effect per site, rounded to 3 decimals for compact display
    site_means = df.groupby('site')['effect'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # One data line per site: <TAB>:<site><TAB><value>.  Writing
        # through the already-open handle replaces the original
        # to_csv(mode='a') append via a second handle (which could race
        # the buffered header writes) plus the non-portable
        # `sed -i` pass that stripped the quotes to_csv adds around
        # tab-containing fields.
        for site, effect in zip(site_means['site'], site_means['effect']):
            f.write(f'\t:{site}\t{effect}\n')
# Aggregate site-mean cell entry for both replicate libraries
for entry_infile, attr_label, entry_outfile in [
    (E2_func_infile, 'E2_entry_mean', E2_func_output),
    (E3_func_infile, 'E3_entry_mean', E3_func_output),
]:
    aggregate_entry_mean(entry_infile, attr_label, entry_outfile)
Aggregate receptor binding¶
In [7]:
def aggregate_binding(infile, name, outfile):
    """Average per-mutation receptor-binding scores to one value per site
    and write a Chimera/ChimeraX residue-attribute ("defattr") file.

    Parameters
    ----------
    infile : str
        Path to a CSV with at least 'site' and 'binding_mean' columns.
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    """
    df = pd.read_csv(infile)
    # Mean binding score per site, rounded to 3 decimals
    site_means = df.groupby('site')['binding_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # One data line per site: <TAB>:<site><TAB><value>.  Direct writes
        # replace the original to_csv(mode='a') append + `sed -i`
        # quote-stripping, which was fragile and non-portable.
        for site, score in zip(site_means['site'], site_means['binding_mean']):
            f.write(f'\t:{site}\t{score}\n')
# Aggregate site-mean receptor binding for both replicate libraries
for binding_infile, attr_label, binding_outfile in [
    (E2_binding_infile, 'E2_binding_mean', E2_binding_output),
    (E3_binding_infile, 'E3_binding_mean', E3_binding_output),
]:
    aggregate_binding(binding_infile, attr_label, binding_outfile)
Aggregate antibody escape¶
In [8]:
def aggregate_escape(infile, name, outfile):
    """Average per-mutation antibody-escape scores to one value per site
    and write a Chimera/ChimeraX residue-attribute ("defattr") file.

    Parameters
    ----------
    infile : str
        Path to a CSV with at least 'site' and 'escape_mean' columns.
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    """
    df = pd.read_csv(infile)
    # Mean escape per site, rounded to 3 decimals
    site_means = df.groupby('site')['escape_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # One data line per site: <TAB>:<site><TAB><value>.  Direct writes
        # replace the original to_csv(mode='a') append + `sed -i`
        # quote-stripping, which was fragile and non-portable.
        for site, escape in zip(site_means['site'], site_means['escape_mean']):
            f.write(f'\t:{site}\t{escape}\n')
# Aggregate site-mean escape for every antibody dataset
for escape_infile, attr_label, escape_outfile in [
    (HENV26_infile, 'HENV26_escape_mean', HENV26_output),
    (HENV32_infile, 'HENV32_escape_mean', HENV32_output),
    (HENV117_infile, 'HENV117_escape_mean', HENV117_output),
    (HENV103_infile, 'HENV103_escape_mean', HENV103_output),
    (m1024_infile, 'm102_escape_mean', m1024_output),
    (nAH13_infile, 'nAH1_escape_mean', nAH13_output),
]:
    aggregate_escape(escape_infile, attr_label, escape_outfile)
Aggregate escape for receptor-binding-face antibodies OR dimerization-face antibodies¶
In [9]:
### Make aggregated escape for receptor binding face mabs
def aggregate_escape_between_binding(name, outfile, infiles=None):
    """Average site-level escape across the receptor-binding-face antibody
    datasets and write a Chimera/ChimeraX residue-attribute file.

    Parameters
    ----------
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    infiles : list of str, optional
        CSV paths, each with 'site' and 'escape_mean' columns.  Defaults
        to the HENV26, HENV117 and m102.4 files from the parameters cells
        (the original hard-coded these globals).
    """
    if infiles is None:
        infiles = [HENV26_infile, HENV117_infile, m1024_infile]
    combined = pd.concat([pd.read_csv(path) for path in infiles])
    # Round to 3 decimals for consistency with the other aggregation
    # functions (the original omitted rounding only in this cell).
    site_means = combined.groupby('site')['escape_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # Direct writes replace the to_csv(mode='a') append + `sed -i`
        # quote-stripping of the original.
        for site, escape in zip(site_means['site'], site_means['escape_mean']):
            f.write(f'\t:{site}\t{escape}\n')
# call function
# Aggregates escape across the receptor-binding-face antibodies
# (HENV26, HENV117, m102.4) into a single attribute file
aggregate_escape_between_binding('binding_face','results/parsed_mapping_data/binding_face.csv')
### Make aggregated escape for dimerization face mabs
def aggregate_escape_between_dimer(name, outfile, infiles=None):
    """Average site-level escape across the dimerization-face antibody
    datasets and write a Chimera/ChimeraX residue-attribute file.

    Parameters
    ----------
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    infiles : list of str, optional
        CSV paths, each with 'site' and 'escape_mean' columns.  Defaults
        to the HENV32 and HENV103 files from the parameters cells
        (the original hard-coded these globals).
    """
    if infiles is None:
        infiles = [HENV32_infile, HENV103_infile]
    combined = pd.concat([pd.read_csv(path) for path in infiles])
    # Round to 3 decimals for consistency with the other aggregation
    # functions (the original omitted rounding only in this cell).
    site_means = combined.groupby('site')['escape_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # Direct writes replace the to_csv(mode='a') append + `sed -i`
        # quote-stripping of the original.
        for site, escape in zip(site_means['site'], site_means['escape_mean']):
            f.write(f'\t:{site}\t{escape}\n')
aggregate_escape_between_dimer('dimer_face','results/parsed_mapping_data/dimer_face.csv')
Make copies that Chimera can use¶
In [10]:
# Specify the directory containing the .csv files
directory = 'results/parsed_mapping_data/'
# Loop through all files in the directory
for filename in os.listdir(directory):
# Check if the file is a .csv file
if filename.endswith('.csv'):
# Generate the new filename by replacing .csv with .defattr
new_filename = filename[:-4] + '.defattr'
# Full path for the source and destination files
src_file = os.path.join(directory, filename)
dest_file = os.path.join(directory, new_filename)
# Copy the content of the .csv file to the new .defattr file
shutil.copy(src_file, dest_file)
print(f"Copied {src_file} to {dest_file}")
Copied results/parsed_mapping_data/HENV103_escape_mean.csv to results/parsed_mapping_data/HENV103_escape_mean.defattr Copied results/parsed_mapping_data/E2_binding_mean.csv to results/parsed_mapping_data/E2_binding_mean.defattr Copied results/parsed_mapping_data/E3_entry_mean.csv to results/parsed_mapping_data/E3_entry_mean.defattr Copied results/parsed_mapping_data/nAH1_escape_mean.csv to results/parsed_mapping_data/nAH1_escape_mean.defattr Copied results/parsed_mapping_data/E3_binding_mean.csv to results/parsed_mapping_data/E3_binding_mean.defattr Copied results/parsed_mapping_data/binding_face.csv to results/parsed_mapping_data/binding_face.defattr Copied results/parsed_mapping_data/E2_entry_mean.csv to results/parsed_mapping_data/E2_entry_mean.defattr Copied results/parsed_mapping_data/m102_escape_mean.csv to results/parsed_mapping_data/m102_escape_mean.defattr Copied results/parsed_mapping_data/HENV117_escape_mean.csv to results/parsed_mapping_data/HENV117_escape_mean.defattr Copied results/parsed_mapping_data/HENV26_escape_mean.csv to results/parsed_mapping_data/HENV26_escape_mean.defattr Copied results/parsed_mapping_data/dimer_face.csv to results/parsed_mapping_data/dimer_face.defattr Copied results/parsed_mapping_data/HENV32_escape_mean.csv to results/parsed_mapping_data/HENV32_escape_mean.defattr
The code below is not currently used; it replaces the B-factor column in PDB files, in case mapping scores directly onto the structure is desired¶
In [11]:
def replace_b_factor(pdb_input, input, pdb_output='modified_7txz_func.pdb'):
    """Rewrite the B-factor column (columns 61-66) of ATOM records in a
    PDB file with the mean 'effect' per site from a DMS CSV.

    Sites absent from the CSV get a B-factor of 0.00; non-ATOM records
    are copied through unchanged.

    Parameters
    ----------
    pdb_input : str
        Path to the input PDB file.
    input : str
        Path to a CSV with 'site' and 'effect' columns.  (Parameter name
        shadows the builtin ``input``; kept for backward compatibility
        with existing callers.)
    pdb_output : str, optional
        Path of the modified PDB file to write.  Defaults to the
        previously hard-coded 'modified_7txz_func.pdb' so existing
        callers are unaffected.
    """
    scores = pd.read_csv(input)
    # site -> mean effect lookup (the original built the same dict via a
    # redundant reset_index/set_index round-trip)
    site_to_effect = scores.groupby('site')['effect'].mean().to_dict()
    with open(pdb_input, 'r') as file:
        lines = file.readlines()
    modified_lines = []
    for line in lines:
        if line.startswith("ATOM"):
            # Residue sequence number occupies PDB columns 23-26
            # (0-based slice 22:26)
            residue_number = int(line[22:26].strip())
            # Missing sites default to 0.0, replacing the duplicated
            # if/else splice branches of the original
            new_beta = site_to_effect.get(residue_number, 0.0)
            modified_lines.append(line[:60] + f"{new_beta:6.2f}" + line[66:])
        else:
            modified_lines.append(line)
    with open(pdb_output, 'w') as file:
        file.writelines(modified_lines)
# Path to the crystal structure used if B-factor mapping is desired
pdb_file_path = 'data/custom_analyses_data/crystal_structures/7txz.pdb'
# Example call, intentionally disabled (writes modified_7txz_func.pdb)
#replace_b_factor(pdb_file_path,E2_func_infile)
In [ ]: