import gzip
import os
import requests
import subprocess
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import matplotlib.colors

import pandas as pd

import seaborn

# this cell tagged `parameters` so that papermill fills parameter values

# input files
data_csv = None  # CSV w DMS data
sitemap_csv = None  # must have columns 'reference_site', 'sequential_site', 'protein_site', 'chains'

# input parameters
pdb_id = None  # Protein Data Bank ID, eg, 6nkg
pdb_type = None  # {"asymmetric_unit", "biological_assembly"}
name = None  # name assigned to visualization
melt_condition_metric_cols = None  # None, or list of columns to melt to create metric and condition
metric = None  # metric to plot
opt_params = None  # optional params, have same meaning as to `configure-dms-viz`:

# created files
pdb_file = None
dms_viz_json = None
input_data_csv = None
input_sitemap_csv = None

# Parameters
pdb_id = "6nk6"
pdb_type = "asymmetric_unit"
name = "effects of mutations to CHIKV E on binding to Mxra8"
melt_condition_metric_cols = ["binding to mouse Mxra8", "binding to human Mxra8"]
metric = "Mxra8 binding"
opt_params = {"condition": "Mxra8_type", "included-chains": "A B C D E F G H", "excluded-chains": "I J K L", "alphabet": "RKHDEQNSTYWFAILMVGPC", "floor": False, "summary-stat": "sum", "filter-cols": {"entry in 293T_Mxra8 cells": "293T-Mxra8 entry"}, "filter-limits": {"entry in 293T_Mxra8 cells": [-8, -4, 0]}, "tooltip-cols": {"sequential_site": "sequential site", "region": "region", "entry in 293T_Mxra8 cells": "293T-Mxra8 entry"}, "heatmap-limits": "-4, 0, 3"}
data_csv = "results/summaries/binding_mouse_vs_human_Mxra8.csv"
sitemap_csv = "data/pdb_sitemaps/6nk6_sitemap.csv"
dms_viz_json = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json"
pdb_file = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb"
input_data_csv = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv"
input_sitemap_csv = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv"

print(f"Reading data from {data_csv=}")

data = pd.read_csv(data_csv)

# cannot have spaces in filter-cols, so rename columns with underscore if needed
if "filter-cols" in opt_params:
    for col, name in list(opt_params["filter-cols"].items()):
        if " " in col:
            new_col = col.replace(" ", "_")
            print(f"{col=} has space, renaming to {new_col}")
            assert new_col not in data.columns, f"{new_col=}, {data.columns=}"
            data[new_col] = data[col]
            opt_params["filter-cols"][new_col] = name
            del opt_params["filter-cols"][col]
            if "filter-limits" in opt_params:
                opt_params["filter-limits"][new_col] = opt_params["filter-limits"][col]
                del opt_params["filter-limits"][col]

Reading data from data_csv='results/summaries/binding_mouse_vs_human_Mxra8.csv'
col='entry in 293T_Mxra8 cells' has space, renaming to entry_in_293T_Mxra8_cells

print(f"Reading the sitemap from {sitemap_csv=}")

sitemap_req_cols = ["reference_site", "sequential_site", "protein_site", "chains"]

sitemap = pd.read_csv(sitemap_csv, dtype={"protein_site": str}).assign(
    chains=lambda x: x["chains"].map(lambda c: [] if pd.isnull(c) else c.split())
)

if not set(sitemap.columns).issubset(sitemap_req_cols):
    raise ValueError(f"{sitemap_csv} has {sitemap.columns}, lacks {sitemap_req_cols=}")
sitemap = sitemap[sitemap_req_cols]

assert len(sitemap) == sitemap["reference_site"].nunique()
assert len(sitemap) == sitemap["sequential_site"].nunique()

print(f"\nComparing `site` in data to `reference_site` in sitemap:")
if sitemap["reference_site"].dtype != data["site"].dtype:
    raise ValueError(f'{sitemap["reference_site"].dtype=}, {data["site"].dtype=}')

nshared = len(set(sitemap["reference_site"]).intersection(data["site"]))
print(f"\n{nshared} sites in both data and sitemap")

sitemap_only = sitemap[~sitemap["reference_site"].isin(data["site"])].reset_index(drop=True)
print(f"\n{len(sitemap_only)} sites in sitemap but not data")
if len(sitemap_only):
    display(sitemap_only)

data_only = (
    data[~data["site"].isin(sitemap["reference_site"])]
    [["site"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"\n{len(data_only)} sites in data but not sitemap")
if len(data_only):
    display(data_only)

# add wildtype residues from data to sitemap
sitemap = sitemap.merge(
    data[["site", "wildtype"]].drop_duplicates().rename(
        columns={"site": "reference_site"}
    ),
    on="reference_site",
    validate="one_to_one",
    how="left",
)

Reading the sitemap from sitemap_csv='data/pdb_sitemaps/6nk6_sitemap.csv'

Comparing `site` in data to `reference_site` in sitemap:

988 sites in both data and sitemap

0 sites in sitemap but not data

0 sites in data but not sitemap

print(f"Getting PDB file for {pdb_id=} to {pdb_file=} for {pdb_type=}")
os.makedirs(os.path.dirname(pdb_file), exist_ok=True)

if pdb_type == "biological_assembly":
    r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb1.gz")
    assert r.status_code == 200
    pdb_content = gzip.decompress(r.content).decode("utf-8")
elif pdb_type == "asymmetric_unit":
    r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb")
    assert r.status_code == 200
    pdb_content = r.content.decode("utf-8")
else:
    raise ValueError(f"invalid {pdb_type=}")

with open(pdb_file, "w") as f:
    f.write(pdb_content)

Getting PDB file for pdb_id='6nk6' to pdb_file='results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb' for pdb_type='asymmetric_unit'

# read in the PDB object
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]

# analyze PDB object in context of other data / params

all_chains = [chain.id for chain in pdb_obj]
print(f"PDB has {all_chains=}")

if "included-chains" in opt_params:
    included_chains = opt_params["included-chains"].split()
    print(f"Included chains for mapping DMS data: {included_chains=}")
    assert set(included_chains).issubset(all_chains)
else:
    included_chains = all_chains
    print("Including all chains for mapping DMS data")

if "excluded-chains" in opt_params:
    if not set(opt_params["excluded-chains"].split()).issubset(all_chains):
        raise ValueError(f"{opt_params['excluded-chains']=} not in {all_chains=}")

sitemap_chains = set().union(*sitemap["chains"])
print(f"The chains specified in the sitemap are {sitemap_chains=}")
if sitemap_chains - set(included_chains):
    print(
        "Removing the following sitemap chains which are not in included-chains:\n"
        + str(sitemap_chains - set(included_chains))
    )
    sitemap = sitemap.assign(
        chains=lambda x: x["chains"].map(
            lambda cs: [c for c in cs if c in included_chains]
        )
    )
elif set(included_chains) - sitemap_chains:
    raise ValueError(f"some {included_chains=} not in {sitemap_chains=}")
sitemap = sitemap.assign(chains=lambda x: x["chains"].map(lambda c: " ".join(c)))

# get wildtype residue at each site
records = []
for chain in included_chains:
    for res in pdb_obj[chain].get_residues():
        if not res.id[0].isspace():
            continue
        aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]
        r = str(res.id[1])
        records.append((chain, r, aa))
pdb_df = pd.DataFrame(records, columns=["chain", "protein_site", "pdb_aa"])
assert pdb_df["protein_site"].dtype == sitemap["protein_site"].dtype

# compare wildtypes in DMS data and actual PDB
for chainset, chainset_df in sitemap.query("chains != ''").groupby("chains", dropna=True):
    chainset = chainset.split()
    print(f"\nComparing wildtype residues in DMS data and PDB for {chainset=}")
    pdb_chainset_df = (
        pdb_df
        .query("chain in @chainset")
        .groupby("protein_site", as_index=False)
        .aggregate(
            pdb_aa=pd.NamedAgg("pdb_aa", lambda s: " ".join(s.unique())),
            n_pdb_aas=pd.NamedAgg("pdb_aa", "nunique"),
        )
    )
    if (pdb_chainset_df["n_pdb_aas"] != 1).any():
        raise ValueError(
            "PDB does not have same amino-acid at all chains in {chainset=}:\n"
            + str(pdb_chainset_df.query("n_pdb_aas != 1"))
        )
    df = (
        chainset_df
        .merge(
            pdb_chainset_df.drop(columns="n_pdb_aas"),
            on="protein_site",
            how="outer",
            validate="one_to_one",
        )
        .assign(
            differ=lambda x: (
                (x["wildtype"] != x["pdb_aa"])
                & x["pdb_aa"].notnull()
                & x["wildtype"].notnull()
            )
        )
        .sort_values("sequential_site")
    )
    assert (df["wildtype"].notnull() | df["pdb_aa"].notnull()).all()
    print(
        "Comparing wildtype amino acids for sites with DMS data versus sites in PDB:\n"
        f"  - sites with same amino acid: {sum(df['wildtype'] == df['pdb_aa'])}\n"
        f"  - sites with different amino acid: {df['differ'].sum()}\n"
        f"  - sites missing in DMS data: {df['wildtype'].isnull().sum()}\n"
        f"  - sites missing in PDB: {df['pdb_aa'].isnull().sum()}"
    )
    if df["differ"].any():
        print("Here are sites with different amino acids between DMS data and PDB:")
        display(df.query("differ").drop(columns="differ").reset_index(drop=True))

PDB has all_chains=['M', 'N', 'O', 'P', 'A', 'E', 'I', 'B', 'F', 'J', 'C', 'G', 'K', 'D', 'H', 'L']
Included chains for mapping DMS data: included_chains=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
The chains specified in the sitemap are sitemap_chains={'H', 'F', 'G', 'B', 'C', 'D', 'A', 'E'}

Comparing wildtype residues in DMS data and PDB for chainset=['A', 'B', 'C', 'D']
Comparing wildtype amino acids for sites with DMS data versus sites in PDB:
  - sites with same amino acid: 423
  - sites with different amino acid: 16
  - sites missing in DMS data: 0
  - sites missing in PDB: 0
Here are sites with different amino acids between DMS data and PDB:

Comparing wildtype residues in DMS data and PDB for chainset=['E', 'F', 'G', 'H']
Comparing wildtype amino acids for sites with DMS data versus sites in PDB:
  - sites with same amino acid: 389
  - sites with different amino acid: 30
  - sites missing in DMS data: 0
  - sites missing in PDB: 4
Here are sites with different amino acids between DMS data and PDB:

# allowed optional params from:
# https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/#configure-dms-viz-format
allowed_params = [
    "condition",
    "metric-name",
    "condition-name",
    "tooltip-cols",
    "filter-cols",
    "filter-limits",
    "heatmap-limits",
    "included-chains",
    "excluded-chains",
    "alphabet",
    "colors",
    "negative-colors",
    "exclude-amino-acids",
    "description",
    "title",
    "floor",
    "summary-stat",
]

if not set(opt_params).issubset(allowed_params):
    raise ValueError(f"invalid `opt_params` {set(opt_params) - set(allowed_params)=}")

# get required columns
data_cols = ["site", "wildtype", "mutant"]
for param in ["tooltip-cols", "filter-cols"]:
    if param in opt_params:
        data_cols += opt_params[param]
if not set(data_cols).issubset(data.columns):
    raise ValueError(f"{data_cols=} not in {data.columns=}")

# melt data if needed
if melt_condition_metric_cols:
    order_map = {c: i for (i, c) in enumerate(melt_condition_metric_cols)}
    if not set(melt_condition_metric_cols).issubset(data.columns):
        raise ValueError(f"{melt_condition_metric_cols} not in {data.columns=}")
    assert melt_condition_metric_cols, f"{melt_condition_metric_cols=}"
    if "condition" not in opt_params:
        raise ValueError(
            f"`melt_condition_metric_cols` requires 'condition' in {opt_params=}"
        )
    data_to_use = (
        data.melt(
            id_vars=data_cols,
            value_vars=melt_condition_metric_cols,
            var_name=opt_params["condition"],
            value_name=metric,
        )
        .sort_values(opt_params["condition"], key=lambda col: col.map(order_map))
    )
else:
    data_to_use = data
data_cols.append(metric)
if "condition" in opt_params:
    data_cols.append(opt_params["condition"])
if not set(data_cols).issubset(data_to_use.columns):
    raise ValueError(f"{data_cols=} not in {data_to_use.columns=}")

# write the sitemap dropping null protein_site columns
print(f"Writing sitemap with null protein sites dropped to {input_sitemap_csv=}")
sitemap.query("protein_site.notnull()").to_csv(input_sitemap_csv, index=False)

# write data to file to use as input, keeping only relevant columns with a protein site
print(f"Writing data for {data_cols=} to {input_data_csv=}")
(
    data_to_use
    [data_to_use["site"].isin(set(sitemap.query("protein_site.notnull()")["reference_site"]))]
    [data_cols]
    .to_csv(input_data_csv, index=False, float_format="%.4g")
)

Writing sitemap with null protein sites dropped to input_sitemap_csv='results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv'

Writing data for data_cols=['site', 'wildtype', 'mutant', 'sequential_site', 'region', 'entry in 293T_Mxra8 cells', 'entry_in_293T_Mxra8_cells', 'Mxra8 binding', 'Mxra8_type'] to input_data_csv='results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv'

def get_hex_color_palette(num_colors):
    colors = seaborn.color_palette("hls", num_colors)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    return hex_colors

if "condition" in opt_params:
    nconditions = data_to_use[opt_params["condition"]].nunique()
else:
    nconditions = 1

opt_params = opt_params.copy()

if "colors" not in opt_params:
    opt_params["colors"] = ",".join(get_hex_color_palette(nconditions))

assert set(opt_params).issubset(allowed_params)

cmds = [
    "configure-dms-viz",
    "format",
    "--name", name,
    "--input", input_data_csv,
    "--metric", metric,
    "--sitemap", input_sitemap_csv,
    "--structure", pdb_file,
    "--output", dms_viz_json,
]
for key, val in opt_params.items():
    cmds += [f"--{key}", str(val)]

print(f"Running the following cmds:\n{' '.join(cmds)}\n")
res = subprocess.run(cmds, capture_output=True, text=True)
res_str = []
for attr in ["returncode", "args", "stdout", "stderr"]:
    res_str.append(f"{attr}:\n{getattr(res, attr)}")
res_str = "\n\n".join(res_str)
if res.returncode != 0:
    raise RuntimeError(f"Command failed:\n\n{res_str}")
else:
    print(f"Command succeeded:\n\n{res_str}")

Running the following cmds:
configure-dms-viz format --name 293T-Mxra8 entry --input results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv --metric Mxra8 binding --sitemap results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv --structure results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb --output results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json --condition Mxra8_type --included-chains A B C D E F G H --excluded-chains I J K L --alphabet RKHDEQNSTYWFAILMVGPC --floor False --summary-stat sum --filter-cols {'entry_in_293T_Mxra8_cells': '293T-Mxra8 entry'} --filter-limits {'entry_in_293T_Mxra8_cells': [-8, -4, 0]} --tooltip-cols {'sequential_site': 'sequential site', 'region': 'region', 'entry in 293T_Mxra8 cells': '293T-Mxra8 entry'} --heatmap-limits -4, 0, 3 --colors #db5f57,#57d3db

Command succeeded:

returncode:
0

args:
['configure-dms-viz', 'format', '--name', '293T-Mxra8 entry', '--input', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv', '--metric', 'Mxra8 binding', '--sitemap', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv', '--structure', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb', '--output', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json', '--condition', 'Mxra8_type', '--included-chains', 'A B C D E F G H', '--excluded-chains', 'I J K L', '--alphabet', 'RKHDEQNSTYWFAILMVGPC', '--floor', 'False', '--summary-stat', 'sum', '--filter-cols', "{'entry_in_293T_Mxra8_cells': '293T-Mxra8 entry'}", '--filter-limits', "{'entry_in_293T_Mxra8_cells': [-8, -4, 0]}", '--tooltip-cols', "{'sequential_site': 'sequential site', 'region': 'region', 'entry in 293T_Mxra8 cells': '293T-Mxra8 entry'}", '--heatmap-limits', '-4, 0, 3', '--colors', '#db5f57,#57d3db']

stdout:

Formatting data for visualization using the 'Mxra8 binding' column from 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv'...

Using sitemap from 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv'.

Warning: NaN values were found in the metric column. These rows will be filtered out.

Warning: There are sites where there are no mutations, in other words, only the wildtype residue is present in the mutation column for that site. These rows will be filtered out.
Three values were provided for heatmap limits, these will be the min, center, and max of the scale.

Warning: The 'entry_in_293T_Mxra8_cells' filter limit '-8' is less than the minimum value of -7.712. Setting the min value to -7.712.

About 94.56% (799 of 845) of the wildtype residues in the data match the corresponding residues in the structure.
About 0.47% (4 of 849) of the data sites are missing from the structure.

Success! The visualization JSON was written to 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json'


stderr:

	reference_site	sequential_site	protein_site	chains	wildtype	pdb_aa
0	34(E1)	583	34	A B C D	L	Q
1	98(E1)	647	98	A B C D	T	A
2	142(E1)	691	142	A B C D	V	I
3	145(E1)	694	145	A B C D	S	A
4	162(E1)	711	162	A B C D	I	V
5	211(E1)	760	211	A B C D	E	K
6	225(E1)	774	225	A B C D	S	A
7	269(E1)	818	269	A B C D	M	V
8	276(E1)	825	276	A B C D	M	I
9	296(E1)	845	296	A B C D	L	V
10	321(E1)	870	321	A B C D	A	T
11	343(E1)	892	343	A B C D	E	D
12	344(E1)	893	344	A B C D	I	V
13	379(E1)	928	379	A B C D	E	A
14	404(E1)	953	404	A B C D	V	T
15	420(E1)	969	420	A B C D	V	I

	reference_site	sequential_site	protein_site	chains	wildtype	pdb_aa
0	12(E2)	77	12	E F G H	I	T
1	32(E2)	97	32	E F G H	V	I
2	72(E2)	137	72	E F G H	N	S
3	74(E2)	139	74	E F G H	M	T
4	82(E2)	147	82	E F G H	R	G
5	84(E2)	149	84	E F G H	F	L
6	118(E2)	183	118	E F G H	G	S
7	124(E2)	189	124	E F G H	S	T
8	132(E2)	197	132	E F G H	D	E
9	140(E2)	205	140	E F G H	K	R
10	149(E2)	214	149	E F G H	R	K
11	157(E2)	222	157	E F G H	A	V
12	182(E2)	247	182	E F G H	S	T
13	194(E2)	259	194	E F G H	S	G
14	205(E2)	270	205	E F G H	D	G
15	222(E2)	287	222	E F G H	V	I
16	234(E2)	299	234	E F G H	K	N
17	255(E2)	320	255	E F G H	V	I
18	284(E2)	349	284	E F G H	I	T
19	302(E2)	367	302	E F G H	E	Q
20	307(E2)	372	307	E F G H	Q	H
21	317(E2)	382	317	E F G H	I	V
22	318(E2)	383	318	E F G H	R	T
23	342(E2)	407	342	E F G H	L	M
24	370(E2)	435	370	E F G H	V	I
25	384(E2)	449	384	E F G H	V	T
26	390(E2)	455	390	E F G H	M	V
27	415(E2)	480	415	E F G H	I	L
28	418(E2)	483	418	E F G H	I	V
29	421(E2)	486	421	E F G H	A	T

Configure for `dms-viz`¶

Get the parameters for configuration¶

Read the data and site numbering¶

Get the PDB file and check against DMS data¶

Run `configure-dms-viz format`¶

Configure for dms-viz¶

Get the parameters for configuration¶

Read the data and site numbering¶

Get the PDB file and check against DMS data¶

Run configure-dms-viz format¶

Configure for `dms-viz`¶

Run `configure-dms-viz format`¶