Get the distances of residues from Mxra8 in the structure
In [1]:
# this cell is tagged parameters for `papermill` parameterization
# These are defaults; the "# Parameters" cell that follows reassigns all three.

# Input CSV of additional per-site annotations (read later with `pd.read_csv`).
addtl_site_annotations_csv = "../data/addtl_site_annotations.csv"
# Output CSV to which the closest per-site distances to Mxra8 are written.
dists_csv = "../results/mxra8_distances/mxra8_dists.csv"
# Maps PDB ID -> protein name -> list of chain IDs for that protein.
# Every PDB entry must define "Mxra8" (it is indexed directly later), while
# "E1"/"E2"/"E3" are each optional and skipped when absent (e.g. no "E3" for 6nk6).
chain_defs = {
"6nk7": {
"E1": ["A", "B", "C", "D"],
"E2": ["E", "F", "G", "H"],
"E3": ["U", "V", "W", "X"],
"Mxra8": ["N"],
},
"6nk6": {
"E1": ["A", "B", "C", "D"],
"E2": ["E", "F", "G", "H"],
"Mxra8": ["M", "N", "O", "P"],
}
}
In [2]:
# Parameters
# NOTE(review): presumably the cell injected by `papermill` at run time; it
# overrides the defaults from the cell tagged `parameters` (paths here are
# relative to a different working directory than the defaults) -- confirm.
chain_defs = {
"6nk7": {
"E1": ["A", "B", "C", "D"],
"E2": ["E", "F", "G", "H"],
"E3": ["U", "V", "W", "X"],
"Mxra8": ["N"],
},
"6nk6": {
"E1": ["A", "B", "C", "D"],
"E2": ["E", "F", "G", "H"],
"Mxra8": ["M", "N", "O", "P"],
},
}
addtl_site_annotations_csv = "data/addtl_site_annotations.csv"
dists_csv = "results/mxra8_distances/mxra8_dists.csv"
In [3]:
import requests
import tempfile
import altair as alt
import numpy
import pandas as pd
import polyclonal.pdb_utils
Get distances of each chain and site from Mxra8:
In [4]:
# Build one record per (E-protein residue, Mxra8 residue) pair for every PDB:
# (PDB, E protein, E chain, Mxra8 chain, E site, Mxra8 site, distance).
records = []
for pdb_id in chain_defs:
    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    print(f"Getting PDB distances for {pdb_id=} from {pdb_url=}")
    # Use a timeout so a stalled connection cannot hang the notebook, and fail
    # loudly on an HTTP error rather than writing an error page to the temporary
    # file and trying to parse it as a PDB.
    r = requests.get(pdb_url, timeout=60)
    r.raise_for_status()
    chain_dists = {}
    mxra8_chains = chain_defs[pdb_id]["Mxra8"]
    # The distance function takes a file path, so spool the download to a
    # named temporary file that exists only for the duration of this block.
    with tempfile.NamedTemporaryFile() as tmpf:
        _ = tmpf.write(r.content)
        tmpf.flush()  # make sure all bytes are on disk before it is read by name
        for e_prot in ["E1", "E2", "E3"]:
            # Not every structure defines every E protein.
            if e_prot in chain_defs[pdb_id]:
                print(f"Getting distances to {e_prot=}")
                chain_dists[e_prot] = polyclonal.pdb_utils.inter_residue_distances(
                    tmpf.name, chain_defs[pdb_id][e_prot] + mxra8_chains
                )
    for e_prot, df in chain_dists.items():
        e_chain_ids = chain_defs[pdb_id][e_prot]
        for site_1, site_2, d, chain_1, chain_2 in df[
            ["site_1", "site_2", "distance", "chain_1", "chain_2"]
        ].itertuples(index=False):
            # Keep only E-protein <-> Mxra8 pairs, and orient each record so the
            # E-protein residue always comes first whichever way the pair was
            # reported. Pairs within E or within Mxra8 are dropped.
            if chain_2 in mxra8_chains and chain_1 in e_chain_ids:
                records.append((pdb_id, e_prot, chain_1, chain_2, site_1, site_2, d))
            elif chain_1 in mxra8_chains and chain_2 in e_chain_ids:
                records.append((pdb_id, e_prot, chain_2, chain_1, site_2, site_1, d))

chain_dists_to_mxra8 = pd.DataFrame(
    records,
    columns=["PDB", "E", "E_chain", "Mxra8_chain", "E_site", "Mxra8_site", "distance"],
)
Getting PDB distances for pdb_id='6nk7' from pdb_url='https://files.rcsb.org/download/6nk7.pdb'
Getting distances to e_prot='E1'
Getting distances to e_prot='E2'
Getting distances to e_prot='E3'
Getting PDB distances for pdb_id='6nk6' from pdb_url='https://files.rcsb.org/download/6nk6.pdb'
Getting distances to e_prot='E1'
Getting distances to e_prot='E2'
For each site of each E protein in each PDB, get the closest distance to any Mxra8 residue (taking the minimum over all chains):
In [5]:
# PDB ID used to label the chain columns. The original code read this from the
# loop variable `pdb_id` left over from an earlier cell; deriving it from
# `chain_defs` gives the same value (the last PDB defined) without depending on
# hidden state from another cell.
# NOTE(review): rows from *every* PDB share these column names, so the suffix
# does not say which structure a row's chains belong to -- use the `PDB` column
# for that. The names are kept as-is so the CSV schema does not change.
chain_label_pdb = list(chain_defs)[-1]

# For each (PDB, E protein, site), keep the row with the smallest distance:
# sorting by distance and taking the first row of each group selects the
# minimum over all E chains, Mxra8 chains, and Mxra8 residues.
closest_dists = (
    chain_dists_to_mxra8
    .sort_values("distance")
    .groupby(["PDB", "E", "E_site"], as_index=False)
    .first()
    [["PDB", "E", "E_site", "distance", "E_chain", "Mxra8_chain", "Mxra8_site"]]
    .rename(
        columns={
            "E": "region",
            "E_site": "site",
            "E_chain": f"E_{chain_label_pdb}_chain",
            "Mxra8_chain": f"Mxra8_{chain_label_pdb}_chain",
            "distance": "distance_to_Mxra8",
        }
    )
)

# Distances are written rounded to one decimal place; the in-memory frame
# displayed below keeps full precision.
print(f"Writing to {dists_csv=}")
closest_dists.to_csv(dists_csv, index=False, float_format="%.1f")
closest_dists
Writing to dists_csv='results/mxra8_distances/mxra8_dists.csv'
Out[5]:
| PDB | region | site | distance_to_Mxra8 | E_6nk6_chain | Mxra8_6nk6_chain | Mxra8_site | |
|---|---|---|---|---|---|---|---|
| 0 | 6nk6 | E1 | 1 | 19.711683 | A | M | 237 |
| 1 | 6nk6 | E1 | 2 | 18.327227 | B | N | 237 |
| 2 | 6nk6 | E1 | 3 | 18.113394 | B | N | 237 |
| 3 | 6nk6 | E1 | 4 | 19.478868 | D | P | 237 |
| 4 | 6nk6 | E1 | 5 | 18.130732 | D | P | 237 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1640 | 6nk7 | E3 | 60 | 16.058920 | W | N | 188 |
| 1641 | 6nk7 | E3 | 61 | 13.160142 | W | N | 301 |
| 1642 | 6nk7 | E3 | 62 | 9.372482 | W | N | 188 |
| 1643 | 6nk7 | E3 | 63 | 7.213562 | W | N | 188 |
| 1644 | 6nk7 | E3 | 64 | 5.668389 | W | N | 195 |
1645 rows × 7 columns
Add in additional site annotations, which include contacts as defined by [Basore et al](https://www.cell.com/cell/fulltext/S0092-8674%2819%2930392-7?sf214631088=1):
In [6]:
# Load the per-site annotations, rename their columns to match the names used
# in `closest_dists`, and keep only the columns needed for the merge and plot.
site_annotations = pd.read_csv(addtl_site_annotations_csv).rename(
    columns={"protein_site": "site", "contacts": "Basore_contact"}
)[["region", "site", "domain", "Basore_contact"]]

# Several distance rows (one per PDB) can map to the same annotation row, so
# validate that (region, site) is unique on the annotation side.
closest_dists = closest_dists.merge(
    site_annotations,
    on=["region", "site"],
    validate="m:1",
)
Plot the distribution of distances for the different contact types:
In [7]:
# Fix the seed so the random vertical jitter is the same on every run.
numpy.random.seed(1)

# Hovering over a point selects every point that shares its region and site.
site_selection = alt.selection_point(
    fields=["region", "site"],
    on="mouseover",
    empty=False,
)

# One uniform jitter value per row, used to spread points within each
# `Basore_contact` band of the y-axis.
jittered_dists = closest_dists.assign(
    jitter=numpy.random.uniform(-0.5, 0.5, size=len(closest_dists))
)

(
    alt.Chart(jittered_dists)
    .add_params(site_selection)
    .mark_circle(stroke="black")
    .encode(
        # Distances beyond 20 are clamped to the edge of the axis.
        x=alt.X("distance_to_Mxra8", scale=alt.Scale(domain=[0, 20], clamp=True)),
        y=alt.Y("Basore_contact"),
        yOffset=alt.YOffset("jitter", scale=alt.Scale(domain=[-1, 1])),
        color=alt.Color("region"),
        row=alt.Row(
            "PDB",
            header=alt.Header(orient="top", labelFontSize=14),
            title=None,
        ),
        tooltip=["site", "region", alt.Tooltip("distance_to_Mxra8", format=".1f")],
        # Emphasize the hovered site: larger, thicker outline, fully opaque.
        size=alt.condition(site_selection, alt.value(100), alt.value(35)),
        strokeWidth=alt.condition(site_selection, alt.value(4), alt.value(0.5)),
        fillOpacity=alt.condition(site_selection, alt.value(1), alt.value(0.5)),
    )
    .properties(height=alt.Step(40))
)
Out[7]:
In [ ]: