# this cell is tagged as parameters for `papermill` parameterization
e2_pdb = None
e3_pdb = None
rbp_pdb = None

ephrin_b2_close_residues = None
ephrin_b3_close_residues = None

ephrin_b2_distance = None
ephrin_b3_distance = None
dimerization_distance = None

# Parameters
e2_pdb = "data/custom_analyses_data/crystal_structures/2vsm.pdb"
e3_pdb = "data/custom_analyses_data/crystal_structures/3d12.pdb"
rbp_pdb = "data/custom_analyses_data/crystal_structures/7txz.pdb"
ephrin_b2_close_residues = "results/distances/2vsm_close_residues.csv"
ephrin_b3_close_residues = "results/distances/3d12_close_residues.csv"
ephrin_b2_distance = "results/distances/2vsm_distances.csv"
ephrin_b3_distance = "results/distances/3d12_distances.csv"
dimerization_distance = "results/distances/7txz_distances.csv"

import math
import os

import altair as alt

import numpy as np

import pandas as pd

import scipy.stats

import Bio.SeqIO

from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBParser
import xml.etree.ElementTree as ET
# pd.set_option('display.max_rows', None)

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if (
    os.getcwd()
    == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
    pass
    print("Already in correct directory")
else:
    os.chdir(
        "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
    )
    print("Setup in correct directory")

Setup in correct directory

if ephrin_b2_close_residues is None:
    e2_pdb = "data/custom_analyses_data/crystal_structures/2vsm.pdb"
    e3_pdb = "data/custom_analyses_data/crystal_structures/3d12.pdb"
    rbp_pdb = "data/custom_analyses_data/crystal_structures/7txz.pdb"

    ephrin_b2_close_residues = "results/distances/2vsm_close_residues.csv"
    ephrin_b3_close_residues = "results/distances/3d12_close_residues.csv"

    ephrin_b2_distance = "results/distances/2vsm_distances.csv"
    ephrin_b3_distance = "results/distances/3d12_distances.csv"
    dimerization_distance = "results/distances/7txz_distances.csv"

# check whether output directory exists
def create_directory(directory_path):
    # Check if the directory exists
    if not os.path.exists(directory_path):
        # Create the directory
        os.makedirs(directory_path)
        print(f"Directory '{directory_path}' created.")
    else:
        print(f"Directory '{directory_path}' already exists.")


# Example usage
directory_path = "results/distances/"
create_directory(directory_path)

Directory 'results/distances/' already exists.

cutoff_distance = 4

def calculate_nearby_residues(
    pdb_path, source_chain_id, target_chain_ids, name, cutoff_distance
):
    # Initialize the PDB parser and load the structure from the given pdb_path
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("structure_id", pdb_path)

    # Retrieve the source chain and target chains from the structure
    source_chain = structure[0][source_chain_id]
    target_chains = [structure[0][chain_id] for chain_id in target_chain_ids]

    data = []

    # Loop through each residue in the source chain
    for residueA in source_chain:
        # Skip water and common non-protein residues
        if residueA.resname in ["HOH", "WAT", "IPA", "NAG", "SO4"]:
            continue

        nearby_residues = []

        # Compare against residues in each target chain
        for target_chain in target_chains:
            for residueB in target_chain:
                # Skip water and common non-protein residues in target chains as well
                if residueB.resname in ["HOH", "WAT", "IPA", "SO4"]:
                    continue

                is_within_cutoff = False
                # Calculate distance between atoms in the two residues
                for atomA in residueA:
                    for atomB in residueB:
                        distance = atomA - atomB
                        if distance < cutoff_distance:
                            is_within_cutoff = True
                            break
                    if is_within_cutoff:
                        break

                # If within cutoff, add to nearby residues list
                if is_within_cutoff:
                    nearby_residues.append(
                        {
                            "chain": target_chain.get_id(),
                            "residue_id": residueB.id[1],
                            "residue_name": residueB.resname,
                            "distance": distance,
                        }
                    )

        # Append collected data for each residue in source chain
        data.append(
            {
                "wildtype": residueA.resname,
                "site": residueA.id[1],
                "nearby_residues": nearby_residues,
                "custom_source": name,
            }
        )

    # Convert collected data into a pandas DataFrame for easier manipulation
    df = pd.DataFrame(data)
    return df


# Example usage of the function with specified parameters
pdb_path_2VSM = e2_pdb
source_chain_2VSM = "A"
target_chains_2VSM = ["B"]

pdb_path_3D12 = e3_pdb
source_chain_3D12 = "A"
target_chains_3D12 = "B"

# Calculate nearby residues and add custom columns to summarize data
df_2VSM_close = calculate_nearby_residues(
    pdb_path_2VSM, source_chain_2VSM, target_chains_2VSM, "2VSM_source", cutoff_distance
)
df_2VSM_close["number_of_contact_residues_within_5"] = df_2VSM_close[
    "nearby_residues"
].apply(len)
df_2VSM_close["close_residues"] = df_2VSM_close["nearby_residues"].apply(
    lambda x: ", ".join([str(item["residue_id"]) for item in x]) if x else None
)

df_3D12_close = calculate_nearby_residues(
    pdb_path_3D12, source_chain_3D12, target_chains_3D12, "3D12_source", cutoff_distance
)
df_3D12_close[f"number_of_contact_residues_within_{cutoff_distance}"] = df_3D12_close[
    "nearby_residues"
].apply(len)
df_3D12_close["close_residues"] = df_3D12_close["nearby_residues"].apply(
    lambda x: ", ".join([str(item["residue_id"]) for item in x]) if x else None
)


# Adjust numbering for specific residues based on criteria
def adjust_residues(residue_str):
    if residue_str is None:
        return None

    residues = [int(residue_id) for residue_id in residue_str.split(", ")]

    adjusted_residues = [
        residue_id - 3 if residue_id < 68 else residue_id for residue_id in residues
    ]

    return ", ".join(map(str, adjusted_residues))


df_2VSM_close["close_residues"] = df_2VSM_close["close_residues"].apply(adjust_residues)
df_2VSM_close.to_csv(ephrin_b2_close_residues, index=False)
df_3D12_close.to_csv(ephrin_b3_close_residues, index=False)

def calculate_min_distances(pdb_path, source_chain_id, target_chain_ids, name):
    """
    Calculate the minimum distance between residues in a source chain and residues in target chains.

    Args:
        pdb_path (str): Path to the PDB file.
        source_chain_id (str): ID of the source chain.
        target_chain_ids (list): List of IDs for the target chains.
        name (str): A custom name for the data source.

    Returns:
        pandas.DataFrame: A DataFrame containing the minimum distances and related information.
    """
    # Initialize the PDB parser and load the structure from the given pdb_path
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("structure_id", pdb_path)

    # Retrieve the source chain and target chains from the structure
    source_chain = structure[0][source_chain_id]
    target_chains = [structure[0][chain_id] for chain_id in target_chain_ids]

    data = []

    # Loop through each residue in the source chain
    for residueA in source_chain:
        # Skip water and common non-protein residues
        if residueA.resname in ["HOH", "WAT", "IPA", "NAG", "SO4"]:
            continue

        # Initialize variables to track the closest residue and its distance
        min_distance = float("inf")
        closest_residueB = None
        closest_chain_id = None
        residues_within_4 = 0  # Count of residues within 4 angstroms

        # Compare against residues in each target chain
        for target_chain in target_chains:
            for residueB in target_chain:
                if residueB.resname in ["HOH", "WAT", "IPA", "SO4"]:
                    continue

                # Check for minimum distance and residues within 4 angstroms
                is_within_4 = False
                for atomA in residueA:
                    for atomB in residueB:
                        distance = atomA - atomB
                        if distance < min_distance:
                            min_distance = distance
                            closest_residueB = residueB
                            closest_chain_id = target_chain.get_id()
                        if distance < 4:
                            is_within_4 = True
                if is_within_4:
                    residues_within_4 += 1

        # Append collected data for each residue in source chain
        data.append(
            {
                "wildtype": residueA.resname,
                "site": residueA.id[1],
                "chain": closest_chain_id,
                "residue": closest_residueB.id[1],
                "residue_name": closest_residueB.resname,
                "distance": min_distance,
                "residues_within_4": residues_within_4,
                "custom_source": name,
            }
        )

    # Convert collected data into a pandas DataFrame for easier manipulation
    df = pd.DataFrame(data)
    return df


# Usage examples with specified parameters for different PDB structures
pdb_path_2VSM = e2_pdb
source_chain_2VSM = "A"
target_chains_2VSM = "B"

pdb_path_3D12 = e3_pdb
source_chain_3D12 = "A"
target_chains_3D12 = "B"

pdb_path_7txz = rbp_pdb
source_chain_7txz = "B"
target_chains_7txz = "A"

#pdb_path_7txz = rbp_pdb
source_chain_7txz_A = "A"
target_chains_7txz_B = "B"

# Dictionary to convert three-letter amino acid codes to one-letter
three_to_one_letter = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLU": "E",
    "GLN": "Q",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
}

#Calculate minimum distances and modify dataframes with additional information
df_2VSM = calculate_min_distances(
    pdb_path_2VSM, source_chain_2VSM, target_chains_2VSM, "E2"
)
df_2VSM["E2_PDB_residue"] = df_2VSM["residue_name"].replace(three_to_one_letter)
# Adjust residue numbering for specific cases
df_2VSM["residue"] = np.where(
    df_2VSM["residue"] < 68, df_2VSM["residue"] - 3, df_2VSM["residue"]
)

df_3D12 = calculate_min_distances(
    pdb_path_3D12, source_chain_3D12, target_chains_3D12, "E3"
)
df_3D12["E3_PDB_residue"] = df_3D12["residue_name"].replace(three_to_one_letter)

df_7txz = calculate_min_distances(
    pdb_path_7txz, 'A', 'B', "dimerization"
)
df_7txz["dimerization_PDB_residue"] = df_7txz["residue_name"].replace(
    three_to_one_letter
)

df_7txz_assm = calculate_min_distances(
    pdb_path_7txz, 'B', 'A', "dimerization"
)
df_7txz_assm["dimerization_PDB_residue"] = df_7txz_assm["residue_name"].replace(
    three_to_one_letter
)

EFNB2_dist = calculate_min_distances(
    pdb_path_2VSM, 'B','A','EFNB2'
)
#EFNB2_dist["site"] = np.where(
#    EFNB2_dist["site"] < 68, EFNB2_dist["site"] - 3, EFNB2_dist["site"]
#)

EFNB3_dist = calculate_min_distances(
    pdb_path_3D12, 'B','A','EFNB3'
)

print("All done!")

# Save the processed data to CSV files
df_2VSM.to_csv(ephrin_b2_distance, index=False)
df_3D12.to_csv(ephrin_b3_distance, index=False)
df_7txz.to_csv(dimerization_distance, index=False)

All done!

def find_close_residues(df, distance_cutoff):
    close = df[df["distance"] <= distance_cutoff]
    unique_sites = list(close["site"].unique())
    return unique_sites

ephrin_b2_close = find_close_residues(df_2VSM, 4)
print(f'For 2VSM (RBP/Ephrin-B2), the close contact sites are: {ephrin_b2_close}\n')

ephrin_b3_close = find_close_residues(df_3D12, 4)
print(f'For 3D12 (RBP/Ephrin-B3), the close contact sites are: {ephrin_b3_close}\n')

dimerization = find_close_residues(df_7txz, 4)
print(f' For 7TXZ Chain A, close contact sites are: {dimerization}')

dimerization_assm = find_close_residues(df_7txz_assm,4)
print(f' For 7TXZ Chain B, close contact sites are : {dimerization_assm}')

EFNB2_close = find_close_residues(EFNB2_dist,4)
print(f' For EFNB2, close contact sites are : {EFNB2_close}')

EFNB3_close = find_close_residues(EFNB3_dist,4)
print(f' For EFNB3, close contact sites are : {EFNB3_close}')

# Function for finding the combined values for two different lists
def find_combined_sites(list1, list2):
    combined_list = list(list1) + list(list2)
    unique_elements = set(combined_list)
    sorted_list = sorted(unique_elements)
    len_list = len(sorted_list)
    print(f'The number of residues is: {len_list}\n')
    print(sorted_list)

find_combined_sites(ephrin_b2_close,ephrin_b3_close)
find_combined_sites(EFNB2_close,EFNB3_close)

# Function for finding values that are shared between two different lists
def find_overlapping_sites(list1, list2):
    # Convert both lists to sets
    set1 = set(list1)
    set2 = set(list2)
    # Find the intersection (common elements) of both sets
    overlapping_sites = set1.intersection(set2)
    # Convert the resulting set to a sorted list
    sorted_list = sorted(overlapping_sites)
    # Print the number of overlapping sites and the sites themselves
    print(f'The number of overlapping residues is: {len(sorted_list)}\n')
    print(sorted_list)

find_overlapping_sites(ephrin_b2_close, ephrin_b3_close)

For 2VSM (RBP/Ephrin-B2), the close contact sites are: [239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588]

For 3D12 (RBP/Ephrin-B3), the close contact sites are: [239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588]

 For 7TXZ Chain A, close contact sites are: [160, 162, 163, 164, 165, 166, 169, 171, 205, 206, 208, 247, 249, 256, 258, 259, 260, 261, 266, 267, 268, 270, 323, 324, 325, 331]
 For 7TXZ Chain B, close contact sites are : [167, 168, 170, 171, 172, 202, 203, 204, 205, 208, 210, 212, 237, 238, 239, 240, 242, 258, 584, 589, 591]
 For EFNB2, close contact sites are : [57, 58, 60, 97, 101, 102, 103, 106, 107, 108, 109, 111, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130]
 For EFNB3, close contact sites are : [55, 57, 101, 102, 106, 107, 108, 109, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130]
The number of residues is: 33

[239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588]
The number of residues is: 29

[55, 57, 58, 60, 97, 101, 102, 103, 106, 107, 108, 109, 111, 112, 113, 114, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130]
The number of overlapping residues is: 33

[239, 240, 241, 242, 305, 388, 389, 401, 402, 458, 488, 489, 490, 491, 492, 501, 504, 505, 506, 507, 530, 531, 532, 533, 555, 557, 558, 559, 579, 580, 581, 583, 588]

# What residues are shared between chain A and B in dimerization face?
find_combined_sites(dimerization,dimerization_assm)

The number of residues is: 43

[160, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 202, 203, 204, 205, 206, 208, 210, 212, 237, 238, 239, 240, 242, 247, 249, 256, 258, 259, 260, 261, 266, 267, 268, 270, 323, 324, 325, 331, 584, 589, 591]

# What residues are overlapping for both asymmetric heads?
find_overlapping_sites(dimerization, dimerization_assm)

The number of overlapping residues is: 4

[171, 205, 208, 258]

receptor_distance¶

For running interactively¶

Check whether data already exists¶

Calculate amino acid distances of RBP to Ephrins¶

First calculate how many receptor residues are within cutoff distance¶

Which residues are close to receptor?¶