In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# Every variable is a file path.  papermill injects concrete values in the
# next cell; when the notebook is run by hand, the "interactively" cell
# further down fills in defaults for any that are still None.
# --- cell entry (functional) and receptor binding data, E2/E3 libraries ---
E2_func_infile = None
E2_binding_infile = None
E2_func_output = None
E2_binding_output = None
E3_func_infile = None
E3_binding_infile = None
E3_func_output = None
E3_binding_output = None
# --- antibody escape data, one infile/output pair per antibody ---
HENV26_infile = None
HENV26_output = None
HENV32_infile = None
HENV32_output = None
HENV103_infile = None
HENV103_output = None
HENV117_infile = None
HENV117_output = None
m1024_infile = None
m1024_output = None
nAH13_infile = None
nAH13_output = None
In [2]:
# Parameters
# (this cell is auto-injected by papermill at run time; do not edit by hand)
E2_func_infile = "results/filtered_data/entry/e2_entry_filtered.csv"
E2_binding_infile = "results/filtered_data/binding/e2_binding_filtered.csv"
E2_func_output = "results/parsed_mapping_data/E2_entry_mean.csv"
E2_binding_output = "results/parsed_mapping_data/E2_binding_mean.csv"
E3_func_infile = "results/filtered_data/entry/e3_entry_filtered.csv"
E3_binding_infile = "results/filtered_data/binding/e3_binding_filtered.csv"
E3_func_output = "results/parsed_mapping_data/E3_entry_mean.csv"
E3_binding_output = "results/parsed_mapping_data/E3_binding_mean.csv"
HENV26_infile = "results/filtered_data/escape/HENV26_escape_filtered.csv"
HENV32_infile = "results/filtered_data/escape/HENV32_escape_filtered.csv"
HENV103_infile = "results/filtered_data/escape/HENV103_escape_filtered.csv"
HENV117_infile = "results/filtered_data/escape/HENV117_escape_filtered.csv"
# NOTE(review): the m1024_*/nAH13_* variables point at files named
# m102_*/nAH1_* — presumably alternate spellings of the antibody names;
# confirm against the upstream pipeline config.
m1024_infile = "results/filtered_data/escape/m102_escape_filtered.csv"
nAH13_infile = "results/filtered_data/escape/nAH1_escape_filtered.csv"
HENV26_output = "results/parsed_mapping_data/HENV26_escape_mean.csv"
HENV32_output = "results/parsed_mapping_data/HENV32_escape_mean.csv"
HENV103_output = "results/parsed_mapping_data/HENV103_escape_mean.csv"
HENV117_output = "results/parsed_mapping_data/HENV117_escape_mean.csv"
m1024_output = "results/parsed_mapping_data/m102_escape_mean.csv"
nAH13_output = "results/parsed_mapping_data/nAH1_escape_mean.csv"
Import packages¶
In [3]:
import math
import os
import shutil
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
import yaml
import csv
import subprocess
Set working directory¶
In [4]:
# Ensure the notebook runs from the project root so relative paths resolve.
# NOTE(review): os.getcwd() normally returns a path without a trailing
# slash, so this comparison is likely always False and we always chdir;
# chdir to the current directory is harmless, so behavior is unchanged.
project_dir = '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/'
if os.getcwd() == project_dir:
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")
Setup in correct directory
For running notebook interactively¶
In [5]:
# Defaults for running the notebook interactively.  When executed through
# papermill the injected "Parameters" cell has already set E2_func_infile,
# so this whole branch is skipped.
if E2_func_infile is None:
    E2_func_infile = 'results/filtered_data/entry/e2_entry_filtered.csv'
    E2_func_output = 'results/parsed_mapping_data/E2_entry_mean.csv'
    E2_binding_infile = 'results/filtered_data/binding/e2_binding_filtered.csv'
    E2_binding_output = 'results/parsed_mapping_data/E2_binding_mean.csv'
    E3_func_infile = 'results/filtered_data/entry/e3_entry_filtered.csv'
    E3_func_output = 'results/parsed_mapping_data/E3_entry_mean.csv'
    E3_binding_infile = 'results/filtered_data/binding/e3_binding_filtered.csv'
    E3_binding_output = 'results/parsed_mapping_data/E3_binding_mean.csv'
    HENV26_infile = 'results/filtered_data/escape/HENV26_escape_filtered.csv'
    # Fixed: HENV26_output and HENV32_output previously pointed at
    # *_escape_parsed.csv, inconsistent with the papermill parameters and
    # with every other antibody (*_escape_mean.csv).
    HENV26_output = 'results/parsed_mapping_data/HENV26_escape_mean.csv'
    HENV32_infile = 'results/filtered_data/escape/HENV32_escape_filtered.csv'
    HENV32_output = 'results/parsed_mapping_data/HENV32_escape_mean.csv'
    HENV103_infile = 'results/filtered_data/escape/HENV103_escape_filtered.csv'
    HENV103_output = 'results/parsed_mapping_data/HENV103_escape_mean.csv'
    HENV117_infile = 'results/filtered_data/escape/HENV117_escape_filtered.csv'
    HENV117_output = 'results/parsed_mapping_data/HENV117_escape_mean.csv'
    m1024_infile = 'results/filtered_data/escape/m102_escape_filtered.csv'
    m1024_output = 'results/parsed_mapping_data/m102_escape_mean.csv'
    nAH13_infile = 'results/filtered_data/escape/nAH1_escape_filtered.csv'
    nAH13_output = 'results/parsed_mapping_data/nAH1_escape_mean.csv'
Aggregate cell entry¶
In [6]:
def aggregate_entry_mean(infile, name, outfile):
    """Average per-mutation cell-entry effects to one value per site and
    write a Chimera/ChimeraX residue-attribute ("defattr") file.

    Parameters
    ----------
    infile : str
        Path to a CSV with at least 'site' and 'effect' columns
        (one row per mutation).
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    """
    df = pd.read_csv(infile)
    # Mean effect per site, rounded to 3 decimals for compact display
    site_means = df.groupby('site')['effect'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # One data line per site: <TAB>:<site><TAB><value>.  Writing
        # through the already-open handle replaces the original
        # to_csv(mode='a') append via a second handle (which could race
        # the buffered header writes) plus the non-portable
        # `sed -i` pass that stripped the quotes to_csv adds around
        # tab-containing fields.
        for site, effect in zip(site_means['site'], site_means['effect']):
            f.write(f'\t:{site}\t{effect}\n')
# Aggregate site-mean cell entry for both replicate libraries
for entry_infile, attr_label, entry_outfile in [
    (E2_func_infile, 'E2_entry_mean', E2_func_output),
    (E3_func_infile, 'E3_entry_mean', E3_func_output),
]:
    aggregate_entry_mean(entry_infile, attr_label, entry_outfile)
Aggregate receptor binding¶
In [7]:
def aggregate_binding(infile, name, outfile):
    """Average per-mutation receptor-binding scores to one value per site
    and write a Chimera/ChimeraX residue-attribute ("defattr") file.

    Parameters
    ----------
    infile : str
        Path to a CSV with at least 'site' and 'binding_mean' columns.
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    """
    df = pd.read_csv(infile)
    # Mean binding score per site, rounded to 3 decimals
    site_means = df.groupby('site')['binding_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # One data line per site: <TAB>:<site><TAB><value>.  Direct writes
        # replace the original to_csv(mode='a') append + `sed -i`
        # quote-stripping, which was fragile and non-portable.
        for site, score in zip(site_means['site'], site_means['binding_mean']):
            f.write(f'\t:{site}\t{score}\n')
# Aggregate site-mean receptor binding for both replicate libraries
for binding_infile, attr_label, binding_outfile in [
    (E2_binding_infile, 'E2_binding_mean', E2_binding_output),
    (E3_binding_infile, 'E3_binding_mean', E3_binding_output),
]:
    aggregate_binding(binding_infile, attr_label, binding_outfile)
Aggregate antibody escape¶
In [8]:
def aggregate_escape(infile, name, outfile):
    """Average per-mutation antibody-escape scores to one value per site
    and write a Chimera/ChimeraX residue-attribute ("defattr") file.

    Parameters
    ----------
    infile : str
        Path to a CSV with at least 'site' and 'escape_mean' columns.
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    """
    df = pd.read_csv(infile)
    # Mean escape per site, rounded to 3 decimals
    site_means = df.groupby('site')['escape_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # One data line per site: <TAB>:<site><TAB><value>.  Direct writes
        # replace the original to_csv(mode='a') append + `sed -i`
        # quote-stripping, which was fragile and non-portable.
        for site, escape in zip(site_means['site'], site_means['escape_mean']):
            f.write(f'\t:{site}\t{escape}\n')
# Aggregate site-mean escape for every antibody dataset
for escape_infile, attr_label, escape_outfile in [
    (HENV26_infile, 'HENV26_escape_mean', HENV26_output),
    (HENV32_infile, 'HENV32_escape_mean', HENV32_output),
    (HENV117_infile, 'HENV117_escape_mean', HENV117_output),
    (HENV103_infile, 'HENV103_escape_mean', HENV103_output),
    (m1024_infile, 'm102_escape_mean', m1024_output),
    (nAH13_infile, 'nAH1_escape_mean', nAH13_output),
]:
    aggregate_escape(escape_infile, attr_label, escape_outfile)
Aggregate escape for receptor-binding-face antibodies OR dimerization-face antibodies¶
In [9]:
### Make aggregated escape for receptor binding face mabs
def aggregate_escape_between_binding(name, outfile, infiles=None):
    """Average site-level escape across the receptor-binding-face antibody
    datasets and write a Chimera/ChimeraX residue-attribute file.

    Parameters
    ----------
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    infiles : list of str, optional
        CSV paths, each with 'site' and 'escape_mean' columns.  Defaults
        to the HENV26, HENV117 and m102.4 files from the parameters cells
        (the original hard-coded these globals).
    """
    if infiles is None:
        infiles = [HENV26_infile, HENV117_infile, m1024_infile]
    combined = pd.concat([pd.read_csv(path) for path in infiles])
    # Round to 3 decimals for consistency with the other aggregation
    # functions (the original omitted rounding only in this cell).
    site_means = combined.groupby('site')['escape_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # Direct writes replace the to_csv(mode='a') append + `sed -i`
        # quote-stripping of the original.
        for site, escape in zip(site_means['site'], site_means['escape_mean']):
            f.write(f'\t:{site}\t{escape}\n')
# call function
# Aggregates escape across the receptor-binding-face antibodies
# (HENV26, HENV117, m102.4) into a single attribute file
aggregate_escape_between_binding('binding_face','results/parsed_mapping_data/binding_face.csv')
### Make aggregated escape for dimerization face mabs
def aggregate_escape_between_dimer(name, outfile, infiles=None):
    """Average site-level escape across the dimerization-face antibody
    datasets and write a Chimera/ChimeraX residue-attribute file.

    Parameters
    ----------
    name : str
        Attribute name written into the defattr header.
    outfile : str
        Path of the attribute file to create (overwritten if it exists).
    infiles : list of str, optional
        CSV paths, each with 'site' and 'escape_mean' columns.  Defaults
        to the HENV32 and HENV103 files from the parameters cells
        (the original hard-coded these globals).
    """
    if infiles is None:
        infiles = [HENV32_infile, HENV103_infile]
    combined = pd.concat([pd.read_csv(path) for path in infiles])
    # Round to 3 decimals for consistency with the other aggregation
    # functions (the original omitted rounding only in this cell).
    site_means = combined.groupby('site')['escape_mean'].mean().reset_index().round(3)
    with open(outfile, 'w') as f:
        # Header lines required by the defattr format
        f.write(f'attribute: {name}\n')
        f.write('match mode: any\n')
        f.write('recipient: residues\n')
        # Direct writes replace the to_csv(mode='a') append + `sed -i`
        # quote-stripping of the original.
        for site, escape in zip(site_means['site'], site_means['escape_mean']):
            f.write(f'\t:{site}\t{escape}\n')
aggregate_escape_between_dimer('dimer_face','results/parsed_mapping_data/dimer_face.csv')
Make copies that Chimera can use¶
In [10]:
# Specify the directory containing the .csv files
directory = 'results/parsed_mapping_data/'
# Loop through all files in the directory
for filename in os.listdir(directory):
# Check if the file is a .csv file
if filename.endswith('.csv'):
# Generate the new filename by replacing .csv with .defattr
new_filename = filename[:-4] + '.defattr'
# Full path for the source and destination files
src_file = os.path.join(directory, filename)
dest_file = os.path.join(directory, new_filename)
# Copy the content of the .csv file to the new .defattr file
shutil.copy(src_file, dest_file)
print(f"Copied {src_file} to {dest_file}")
Copied results/parsed_mapping_data/HENV103_escape_mean.csv to results/parsed_mapping_data/HENV103_escape_mean.defattr Copied results/parsed_mapping_data/E2_binding_mean.csv to results/parsed_mapping_data/E2_binding_mean.defattr Copied results/parsed_mapping_data/E3_entry_mean.csv to results/parsed_mapping_data/E3_entry_mean.defattr Copied results/parsed_mapping_data/nAH1_escape_mean.csv to results/parsed_mapping_data/nAH1_escape_mean.defattr Copied results/parsed_mapping_data/E3_binding_mean.csv to results/parsed_mapping_data/E3_binding_mean.defattr Copied results/parsed_mapping_data/binding_face.csv to results/parsed_mapping_data/binding_face.defattr Copied results/parsed_mapping_data/E2_entry_mean.csv to results/parsed_mapping_data/E2_entry_mean.defattr Copied results/parsed_mapping_data/m102_escape_mean.csv to results/parsed_mapping_data/m102_escape_mean.defattr Copied results/parsed_mapping_data/HENV117_escape_mean.csv to results/parsed_mapping_data/HENV117_escape_mean.defattr Copied results/parsed_mapping_data/HENV26_escape_mean.csv to results/parsed_mapping_data/HENV26_escape_mean.defattr Copied results/parsed_mapping_data/dimer_face.csv to results/parsed_mapping_data/dimer_face.defattr Copied results/parsed_mapping_data/HENV32_escape_mean.csv to results/parsed_mapping_data/HENV32_escape_mean.defattr
The code below is not currently used; it replaces the B-factor column in PDB files, in case mapping scores directly onto the structure is desired¶
In [11]:
def replace_b_factor(pdb_input, input, pdb_output='modified_7txz_func.pdb'):
    """Rewrite the B-factor column (columns 61-66) of ATOM records in a
    PDB file with the mean 'effect' per site from a DMS CSV.

    Sites absent from the CSV get a B-factor of 0.00; non-ATOM records
    are copied through unchanged.

    Parameters
    ----------
    pdb_input : str
        Path to the input PDB file.
    input : str
        Path to a CSV with 'site' and 'effect' columns.  (Parameter name
        shadows the builtin ``input``; kept for backward compatibility
        with existing callers.)
    pdb_output : str, optional
        Path of the modified PDB file to write.  Defaults to the
        previously hard-coded 'modified_7txz_func.pdb' so existing
        callers are unaffected.
    """
    scores = pd.read_csv(input)
    # site -> mean effect lookup (the original built the same dict via a
    # redundant reset_index/set_index round-trip)
    site_to_effect = scores.groupby('site')['effect'].mean().to_dict()
    with open(pdb_input, 'r') as file:
        lines = file.readlines()
    modified_lines = []
    for line in lines:
        if line.startswith("ATOM"):
            # Residue sequence number occupies PDB columns 23-26
            # (0-based slice 22:26)
            residue_number = int(line[22:26].strip())
            # Missing sites default to 0.0, replacing the duplicated
            # if/else splice branches of the original
            new_beta = site_to_effect.get(residue_number, 0.0)
            modified_lines.append(line[:60] + f"{new_beta:6.2f}" + line[66:])
        else:
            modified_lines.append(line)
    with open(pdb_output, 'w') as file:
        file.writelines(modified_lines)
# Path to the crystal structure used if B-factor mapping is desired
pdb_file_path = 'data/custom_analyses_data/crystal_structures/7txz.pdb'
# Example call, intentionally disabled (writes modified_7txz_func.pdb)
#replace_b_factor(pdb_file_path,E2_func_infile)
In [ ]: