receptor_distance¶
Find distances of each RBP residue to either Ephrin-B2 or -B3, and find contact sites
- Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
e2_pdb = None
e3_pdb = None
rbp_pdb = None
ephrin_b2_close_residues = None
ephrin_b3_close_residues = None
ephrin_b2_distance = None
ephrin_b3_distance = None
dimerization_distance = None
In [2]:
# Parameters
e2_pdb = "data/custom_analyses_data/crystal_structures/2vsm.pdb"
e3_pdb = "data/custom_analyses_data/crystal_structures/3d12.pdb"
rbp_pdb = "data/custom_analyses_data/crystal_structures/7txz.pdb"
ephrin_b2_close_residues = "results/distances/2vsm_close_residues.csv"
ephrin_b3_close_residues = "results/distances/3d12_close_residues.csv"
ephrin_b2_distance = "results/distances/2vsm_distances.csv"
ephrin_b3_distance = "results/distances/3d12_distances.csv"
dimerization_distance = "results/distances/7txz_distances.csv"
In [3]:
import math
import os
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBParser
import xml.etree.ElementTree as ET
# pd.set_option('display.max_rows', None)
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if (
os.getcwd()
== "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
pass
print("Already in correct directory")
else:
os.chdir(
"/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
)
print("Setup in correct directory")
Setup in correct directory
For running interactively¶
In [5]:
if ephrin_b2_close_residues is None:
e2_pdb = "data/custom_analyses_data/crystal_structures/2vsm.pdb"
e3_pdb = "data/custom_analyses_data/crystal_structures/3d12.pdb"
rbp_pdb = "data/custom_analyses_data/crystal_structures/7txz.pdb"
ephrin_b2_close_residues = "results/distances/2vsm_close_residues.csv"
ephrin_b3_close_residues = "results/distances/3d12_close_residues.csv"
ephrin_b2_distance = "results/distances/2vsm_distances.csv"
ephrin_b3_distance = "results/distances/3d12_distances.csv"
dimerization_distance = "results/distances/7txz_distances.csv"
Check whether data already exists¶
In [6]:
# check whether output directory exists
def create_directory(directory_path):
# Check if the directory exists
if not os.path.exists(directory_path):
# Create the directory
os.makedirs(directory_path)
print(f"Directory '{directory_path}' created.")
else:
print(f"Directory '{directory_path}' already exists.")
# Example usage
directory_path = "results/distances/"
create_directory(directory_path)
Directory 'results/distances/' already exists.
Calculate amino acid distances of RBP to Ephrins¶
First calculate how many receptor residues are within cutoff distance¶
In [7]:
cutoff_distance = 4
In [8]:
def calculate_nearby_residues(
pdb_path, source_chain_id, target_chain_ids, name, cutoff_distance
):
# Initialize the PDB parser and load the structure from the given pdb_path
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure("structure_id", pdb_path)
# Retrieve the source chain and target chains from the structure
source_chain = structure[0][source_chain_id]
target_chains = [structure[0][chain_id] for chain_id in target_chain_ids]
data = []
# Loop through each residue in the source chain
for residueA in source_chain:
# Skip water and common non-protein residues
if residueA.resname in ["HOH", "WAT", "IPA", "NAG", "SO4"]:
continue
nearby_residues = []
# Compare against residues in each target chain
for target_chain in target_chains:
for residueB in target_chain:
# Skip water and common non-protein residues in target chains as well
if residueB.resname in ["HOH", "WAT", "IPA", "SO4"]:
continue
is_within_cutoff = False
# Calculate distance between atoms in the two residues
for atomA in residueA:
for atomB in residueB:
distance = atomA - atomB
if distance < cutoff_distance:
is_within_cutoff = True
break
if is_within_cutoff:
break
# If within cutoff, add to nearby residues list
if is_within_cutoff:
nearby_residues.append(
{
"chain": target_chain.get_id(),
"residue_id": residueB.id[1],
"residue_name": residueB.resname,
"distance": distance,
}
)
# Append collected data for each residue in source chain
data.append(
{
"wildtype": residueA.resname,
"site": residueA.id[1],
"nearby_residues": nearby_residues,
"custom_source": name,
}
)
# Convert collected data into a pandas DataFrame for easier manipulation
df = pd.DataFrame(data)
return df
# Example usage of the function with specified parameters
pdb_path_2VSM = e2_pdb
source_chain_2VSM = "A"
target_chains_2VSM = ["B"]
pdb_path_3D12 = e3_pdb
source_chain_3D12 = "A"
target_chains_3D12 = "B"
# Calculate nearby residues and add custom columns to summarize data
df_2VSM_close = calculate_nearby_residues(
pdb_path_2VSM, source_chain_2VSM, target_chains_2VSM, "2VSM_source", cutoff_distance
)
df_2VSM_close["number_of_contact_residues_within_5"] = df_2VSM_close[
"nearby_residues"
].apply(len)
df_2VSM_close["close_residues"] = df_2VSM_close["nearby_residues"].apply(
lambda x: ", ".join([str(item["residue_id"]) for item in x]) if x else None
)
df_3D12_close = calculate_nearby_residues(
pdb_path_3D12, source_chain_3D12, target_chains_3D12, "3D12_source", cutoff_distance
)
df_3D12_close[f"number_of_contact_residues_within_{cutoff_distance}"] = df_3D12_close[
"nearby_residues"
].apply(len)
df_3D12_close["close_residues"] = df_3D12_close["nearby_residues"].apply(
lambda x: ", ".join([str(item["residue_id"]) for item in x]) if x else None
)
# Adjust numbering for specific residues based on criteria
def adjust_residues(residue_str):
if residue_str is None:
return None
residues = [int(residue_id) for residue_id in residue_str.split(", ")]
adjusted_residues = [
residue_id - 3 if residue_id < 68 else residue_id for residue_id in residues
]
return ", ".join(map(str, adjusted_residues))
df_2VSM_close["close_residues"] = df_2VSM_close["close_residues"].apply(adjust_residues)
df_2VSM_close.to_csv(ephrin_b2_close_residues, index=False)
df_3D12_close.to_csv(ephrin_b3_close_residues, index=False)
Now find distances to all residues
In [9]:
def calculate_min_distances(pdb_path, source_chain_id, target_chain_ids, name):
"""
Calculate the minimum distance between residues in a source chain and residues in target chains.
Args:
pdb_path (str): Path to the PDB file.
source_chain_id (str): ID of the source chain.
target_chain_ids (list): List of IDs for the target chains.
name (str): A custom name for the data source.
Returns:
pandas.DataFrame: A DataFrame containing the minimum distances and related information.
"""
# Initialize the PDB parser and load the structure from the given pdb_path
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure("structure_id", pdb_path)
# Retrieve the source chain and target chains from the structure
source_chain = structure[0][source_chain_id]
target_chains = [structure[0][chain_id] for chain_id in target_chain_ids]
data = []
# Loop through each residue in the source chain
for residueA in source_chain:
# Skip water and common non-protein residues
if residueA.resname in ["HOH", "WAT", "IPA", "NAG", "SO4"]:
continue
# Initialize variables to track the closest residue and its distance
min_distance = float("inf")
closest_residueB = None
closest_chain_id = None
residues_within_4 = 0 # Count of residues within 4 angstroms
# Compare against residues in each target chain
for target_chain in target_chains:
for residueB in target_chain:
if residueB.resname in ["HOH", "WAT", "IPA", "SO4"]:
continue
# Check for minimum distance and residues within 4 angstroms
is_within_4 = False
for atomA in residueA:
for atomB in residueB:
distance = atomA - atomB
if distance < min_distance:
min_distance = distance
closest_residueB = residueB
closest_chain_id = target_chain.get_id()
if distance < 4:
is_within_4 = True
if is_within_4:
residues_within_4 += 1
# Append collected data for each residue in source chain
data.append(
{
"wildtype": residueA.resname,
"site": residueA.id[1],
"chain": closest_chain_id,
"residue": closest_residueB.id[1],
"residue_name": closest_residueB.resname,
"distance": min_distance,
"residues_within_4": residues_within_4,
"custom_source": name,
}
)
# Convert collected data into a pandas DataFrame for easier manipulation
df = pd.DataFrame(data)
return df
# Usage examples with specified parameters for different PDB structures
pdb_path_2VSM = e2_pdb
source_chain_2VSM = "A"
target_chains_2VSM = "B"
pdb_path_3D12 = e3_pdb
source_chain_3D12 = "A"
target_chains_3D12 = "B"
pdb_path_7txz = rbp_pdb
source_chain_7txz = "B"
target_chains_7txz = "A"
#pdb_path_7txz = rbp_pdb
source_chain_7txz_A = "A"
target_chains_7txz_B = "B"
# Dictionary to convert three-letter amino acid codes to one-letter
three_to_one_letter = {
"ALA": "A",
"ARG": "R",
"ASN": "N",
"ASP": "D",
"CYS": "C",
"GLU": "E",
"GLN": "Q",
"GLY": "G",
"HIS": "H",
"ILE": "I",
"LEU": "L",
"LYS": "K",
"MET": "M",
"PHE": "F",
"PRO": "P",
"SER": "S",
"THR": "T",
"TRP": "W",
"TYR": "Y",
"VAL": "V",
}
#Calculate minimum distances and modify dataframes with additional information
df_2VSM = calculate_min_distances(
pdb_path_2VSM, source_chain_2VSM, target_chains_2VSM, "E2"
)
df_2VSM["E2_PDB_residue"] = df_2VSM["residue_name"].replace(three_to_one_letter)
# Adjust residue numbering for specific cases
df_2VSM["residue"] = np.where(
df_2VSM["residue"] < 68, df_2VSM["residue"] - 3, df_2VSM["residue"]
)
df_3D12 = calculate_min_distances(
pdb_path_3D12, source_chain_3D12, target_chains_3D12, "E3"
)
df_3D12["E3_PDB_residue"] = df_3D12["residue_name"].replace(three_to_one_letter)
df_7txz = calculate_min_distances(
pdb_path_7txz, 'A', 'B', "dimerization"
)
df_7txz["dimerization_PDB_residue"] = df_7txz["residue_name"].replace(
three_to_one_letter
)
df_7txz_assm = calculate_min_distances(
pdb_path_7txz, 'B', 'A', "dimerization"
)
df_7txz_assm["dimerization_PDB_residue"] = df_7txz_assm["residue_name"].replace(
three_to_one_letter
)
EFNB2_dist = calculate_min_distances(
pdb_path_2VSM, 'B','A','EFNB2'
)
#EFNB2_dist["site"] = np.where(
# EFNB2_dist["site"] < 68, EFNB2_dist["site"] - 3, EFNB2_dist["site"]
#)
EFNB3_dist = calculate_min_distances(
pdb_path_3D12, 'B','A','EFNB3'
)
print("All done!")
# Save the processed data to CSV files
df_2VSM.to_csv(ephrin_b2_distance, index=False)
df_3D12.to_csv(ephrin_b3_distance, index=False)
df_7txz.to_csv(dimerization_distance, index=False)
All done!
Which residues are close to receptor?¶
In [10]:
def find_close_residues(df, distance_cutoff):
close = df[df["distance"] <= distance_cutoff]
unique_sites = list(close["site"].unique())
return unique_sites
ephrin_b2_close = find_close_residues(df_2VSM, 4)
print(f'For 2VSM (RBP/Ephrin-B2), the close contact sites are: {ephrin_b2_close}\n')
ephrin_b3_close = find_close_residues(df_3D12, 4)
print(f'For 3D12 (RBP/Ephrin-B3), the close contact sites are: {ephrin_b3_close}\n')
dimerization = find_close_residues(df_7txz, 4)
print(f' For 7TXZ Chain A, close contact sites are: {dimerization}')
dimerization_assm = find_close_residues(df_7txz_assm,4)
print(f' For 7TXZ Chain B, close contact sites are : {dimerization_assm}')
EFNB2_close = find_close_residues(EFNB2_dist,4)
print(f' For EFNB2, close contact sites are : {EFNB2_close}')
EFNB3_close = find_close_residues(EFNB3_dist,4)
print(f' For EFNB3, close contact sites are : {EFNB3_close}')
# Function for finding the combined values for two different lists
def find_combined_sites(list1, list2):
combined_list = list(list1) + list(list2)
unique_elements = set(combined_list)
sorted_list = sorted(unique_elements)
len_list = len(sorted_list)
print(f'The number of residues is: {len_list}\n')
print(sorted_list)
find_combined_sites(ephrin_b2_close,ephrin_b3_close)
find_combined_sites(EFNB2_close,EFNB3_close)
# Function for finding values that are shared between two different lists
def find_overlapping_sites(list1, list2):
# Convert both lists to sets
set1 = set(list1)
set2 = set(list2)
# Find the intersection (common elements) of both sets
overlapping_sites = set1.intersection(set2)
# Convert the resulting set to a sorted list
sorted_list = sorted(overlapping_sites)
# Print the number of overlapping sites and the sites themselves
print(f'The number of overlapping residues is: {len(sorted_list)}\n')
print(sorted_list)
find_overlapping_sites(ephrin_b2_close, ephrin_b3_close)
For 2VSM (RBP/Ephrin-B2), the close contact sites are: [239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588] For 3D12 (RBP/Ephrin-B3), the close contact sites are: [239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588] For 7TXZ Chain A, close contact sites are: [160, 162, 163, 164, 165, 166, 169, 171, 205, 206, 208, 247, 249, 256, 258, 259, 260, 261, 266, 267, 268, 270, 323, 324, 325, 331] For 7TXZ Chain B, close contact sites are : [167, 168, 170, 171, 172, 202, 203, 204, 205, 208, 210, 212, 237, 238, 239, 240, 242, 258, 584, 589, 591] For EFNB2, close contact sites are : [57, 58, 60, 97, 101, 102, 103, 106, 107, 108, 109, 111, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130] For EFNB3, close contact sites are : [55, 57, 101, 102, 106, 107, 108, 109, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130] The number of residues is: 33 [239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588] The number of residues is: 29 [55, 57, 58, 60, 97, 101, 102, 103, 106, 107, 108, 109, 111, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130] The number of overlapping residues is: 33 [239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588]
In [11]:
# What residues are shared between chain A and B in dimerization face?
find_combined_sites(dimerization,dimerization_assm)
The number of residues is: 43 [160, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 202, 203, 204, 205, 206, 208, 210, 212, 237, 238, 239, 240, 242, 247, 249, 256, 258, 259, 260, 261, 266, 267, 268, 270, 323, 324, 325, 331, 584, 589, 591]
In [12]:
# What residues are overlapping for both asymmetric heads?
find_overlapping_sites(dimerization, dimerization_assm)
The number of overlapping residues is: 4 [171, 205, 208, 258]