Get the distances of residues from Mxra8 in the structures

In [1]:
# this cell is tagged parameters for `papermill` parameterization
# CSV of additional per-site annotations; it is read with `pd.read_csv` and merged
# onto the distance table on `region` and `site` later in the notebook.
addtl_site_annotations_csv = "../data/addtl_site_annotations.csv"
# Output CSV that receives the per-site closest distance to Mxra8.
dists_csv = "../results/mxra8_distances/mxra8_dists.csv"

# PDB ID -> protein name -> list of chain IDs for that protein in the structure.
# Every PDB entry needs a "Mxra8" key (it is indexed unconditionally); "E1", "E2"
# and "E3" are each processed only if present, so "6nk6" simply has no "E3" entry.
chain_defs = {
    "6nk7": {
        "E1": ["A", "B", "C", "D"],
        "E2": ["E", "F", "G", "H"],
        "E3": ["U", "V", "W", "X"],
        "Mxra8": ["N"],
    },
    "6nk6": {
        "E1": ["A", "B", "C", "D"],
        "E2": ["E", "F", "G", "H"],
        "Mxra8": ["M", "N", "O", "P"],
    }
}
In [2]:
# Parameters
# NOTE(review): this looks like the cell `papermill` injects at run time -- confirm.
# It runs after the tagged parameters cell, so these assignments replace the
# defaults defined there; note the paths here have no leading "../".
#
# PDB ID -> protein name -> list of chain IDs for that protein in the structure.
chain_defs = {
    "6nk7": {
        "E1": ["A", "B", "C", "D"],
        "E2": ["E", "F", "G", "H"],
        "E3": ["U", "V", "W", "X"],
        "Mxra8": ["N"],
    },
    "6nk6": {
        "E1": ["A", "B", "C", "D"],
        "E2": ["E", "F", "G", "H"],
        "Mxra8": ["M", "N", "O", "P"],
    },
}
# Input CSV of additional per-site annotations (read with `pd.read_csv` below).
addtl_site_annotations_csv = "data/addtl_site_annotations.csv"
# Output CSV for the per-site closest distances to Mxra8.
dists_csv = "results/mxra8_distances/mxra8_dists.csv"
In [3]:
import requests
import tempfile

import altair as alt

import numpy

import pandas as pd

import polyclonal.pdb_utils

Get distances of each chain and site from Mxra8:

In [4]:
# Build one record per (E-protein residue, Mxra8 residue) pair, across every PDB
# listed in `chain_defs`.
records = []

for pdb_id in chain_defs:

    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"

    print(f"Getting PDB distances for {pdb_id=} from {pdb_url=}")
    # Bound the wait on the server, and stop on an HTTP error instead of writing
    # the error response to disk and handing it on as if it were a PDB file.
    r = requests.get(pdb_url, timeout=60)
    r.raise_for_status()

    chain_dists = {}
    mxra8_chains = chain_defs[pdb_id]["Mxra8"]
    # `inter_residue_distances` is given a file name, so the download is written to
    # a named temporary file and flushed before being read back through that name.
    with tempfile.NamedTemporaryFile() as tmpf:
        _ = tmpf.write(r.content)
        tmpf.flush()
        for e_prot in ["E1", "E2", "E3"]:
            # not every structure defines all three proteins (6nk6 has no E3)
            if e_prot in chain_defs[pdb_id]:
                print(f"Getting distances to {e_prot=}")
                chain_dists[e_prot] = polyclonal.pdb_utils.inter_residue_distances(
                    tmpf.name, chain_defs[pdb_id][e_prot] + mxra8_chains
                )

    for e_prot, df in chain_dists.items():
        e_chain_ids = chain_defs[pdb_id][e_prot]
        for site_1, site_2, d, chain_1, chain_2 in df[
            ["site_1", "site_2", "distance", "chain_1", "chain_2"]
        ].itertuples(index=False):
            # Keep only E <-> Mxra8 pairs, in whichever order they were reported,
            # and always store the E side first. Pairs within E or within Mxra8
            # match neither branch and are dropped.
            if chain_2 in mxra8_chains and chain_1 in e_chain_ids:
                records.append((pdb_id, e_prot, chain_1, chain_2, site_1, site_2, d))
            elif chain_1 in mxra8_chains and chain_2 in e_chain_ids:
                records.append((pdb_id, e_prot, chain_2, chain_1, site_2, site_1, d))

chain_dists_to_mxra8 = pd.DataFrame(
    records, columns=["PDB", "E", "E_chain", "Mxra8_chain", "E_site", "Mxra8_site", "distance"],
)
Getting PDB distances for pdb_id='6nk7' from pdb_url='https://files.rcsb.org/download/6nk7.pdb'
Getting distances to e_prot='E1'
Getting distances to e_prot='E2'
Getting distances to e_prot='E3'
Getting PDB distances for pdb_id='6nk6' from pdb_url='https://files.rcsb.org/download/6nk6.pdb'
Getting distances to e_prot='E1'
Getting distances to e_prot='E2'

For each site of each E protein in each PDB, get the closest distance to any Mxra8 residue, taking the minimum across all chains:

In [5]:
# The chain columns are labelled with a PDB ID. Previously that ID was whatever
# value the loop variable `pdb_id` was left holding by the preceding cell; take it
# explicitly from `chain_defs` (its last key, i.e. the same value) so this cell does
# not depend on leaked loop state.
# NOTE(review): the resulting column names (e.g. "E_6nk6_chain") carry a single PDB
# ID but hold chain IDs for rows from every PDB. The names are kept as-is so the
# written CSV schema does not change; confirm whether PDB-neutral names are wanted.
chain_label_pdb = list(chain_defs)[-1]

# Sorting by distance first means `.first()` picks, for each (PDB, protein, site)
# group, the row with the smallest distance -- along with the chains and Mxra8 site
# that give that distance.
closest_dists = (
    chain_dists_to_mxra8
    .sort_values("distance")
    .groupby(["PDB", "E", "E_site"], as_index=False)
    .first()
    [["PDB", "E", "E_site", "distance", "E_chain", "Mxra8_chain", "Mxra8_site"]]
    .rename(
        columns={
            "E": "region",
            "E_site": "site",
            "E_chain": f"E_{chain_label_pdb}_chain",
            "Mxra8_chain": f"Mxra8_{chain_label_pdb}_chain",
            "distance": "distance_to_Mxra8",
        }
    )
)

print(f"Writing to {dists_csv=}")
# floats are written with one decimal place; the in-memory frame keeps full precision
closest_dists.to_csv(dists_csv, index=False, float_format="%.1f")

closest_dists
Writing to dists_csv='results/mxra8_distances/mxra8_dists.csv'
Out[5]:
PDB region site distance_to_Mxra8 E_6nk6_chain Mxra8_6nk6_chain Mxra8_site
0 6nk6 E1 1 19.711683 A M 237
1 6nk6 E1 2 18.327227 B N 237
2 6nk6 E1 3 18.113394 B N 237
3 6nk6 E1 4 19.478868 D P 237
4 6nk6 E1 5 18.130732 D P 237
... ... ... ... ... ... ... ...
1640 6nk7 E3 60 16.058920 W N 188
1641 6nk7 E3 61 13.160142 W N 301
1642 6nk7 E3 62 9.372482 W N 188
1643 6nk7 E3 63 7.213562 W N 188
1644 6nk7 E3 64 5.668389 W N 195

1645 rows × 7 columns

Add in additional site annotations, which include contacts as defined by [Basore et al. (2019)](https://www.cell.com/cell/fulltext/S0092-8674%2819%2930392-7):

In [6]:
# Load the per-site annotations, renaming their columns to match the distance
# table, and keep only the merge keys plus the two annotation columns.
site_annotations = pd.read_csv(addtl_site_annotations_csv).rename(
    columns={"protein_site": "site", "contacts": "Basore_contact"}
)
site_annotations = site_annotations[["region", "site", "domain", "Basore_contact"]]

# validate="m:1" makes pandas raise if (region, site) is not unique in the
# annotations. The join type is pandas' default (inner), so sites without an
# annotation row are not carried forward.
closest_dists = closest_dists.merge(
    site_annotations,
    on=["region", "site"],
    validate="m:1",
)

Plot the distribution of distances for the different contact types:

In [7]:
# Seed NumPy's global generator so the jitter drawn below is the same on every run.
numpy.random.seed(1)

# Hover selection keyed on region and site; with empty=False nothing is treated as
# selected until the pointer is over a point.
site_selection = alt.selection_point(
    fields=["region", "site"],
    on="mouseover",
    empty=False,
)

# One uniform offset per row, used only to spread points vertically within a band.
jittered_dists = closest_dists.assign(
    jitter=numpy.random.uniform(-0.5, 0.5, size=len(closest_dists))
)

# Positional / faceting channels: distance on x (clamped to 0-20), contact type on
# y with the jitter as an offset, colour by region, one row of panels per PDB.
layout_channels = [
    alt.X("distance_to_Mxra8", scale=alt.Scale(domain=[0, 20], clamp=True)),
    alt.Y("Basore_contact"),
    alt.YOffset("jitter", scale=alt.Scale(domain=[-1, 1])),
    alt.Color("region"),
    alt.Row("PDB", header=alt.Header(orient="top", labelFontSize=14), title=None),
]

# The hovered selection is drawn larger, with a heavier outline and fully opaque.
hover_emphasis = {
    "size": alt.condition(site_selection, alt.value(100), alt.value(35)),
    "strokeWidth": alt.condition(site_selection, alt.value(4), alt.value(0.5)),
    "fillOpacity": alt.condition(site_selection, alt.value(1), alt.value(0.5)),
}

(
    alt.Chart(jittered_dists)
    .add_params(site_selection)
    .encode(
        *layout_channels,
        tooltip=["site", "region", alt.Tooltip("distance_to_Mxra8", format=".1f")],
        **hover_emphasis,
    )
    .mark_circle(stroke="black")
    .properties(height=alt.Step(40))
)
Out[7]:
In [ ]: