Configure for dms-viz¶

This notebook runs configure-dms-viz to configure a JSON for a structure / deep mutational scanning dataset.

In [1]:
import gzip
import os
import requests
import subprocess
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import matplotlib.colors

import pandas as pd

import seaborn

Get the parameters for configuration¶

This notebook is parameterized by papermill, so the next cell is tagged paramters for papermill parameterization. After parameterization, the cell will be followed by one that defines the None variables, as for example:

# Parameters
pdb_id = "6nk6"
pdb_type = "asymmetric_unit"
name = "CHIKV E mutation effects on cell entry"
melt_condition_metric_cols = ["entry in 293T_Mxra8 cells", "entry in C636 cells", "entry in 293T_TIM1 cells"]
metric = "entry"
opt_params = {"condition": "cells", "included-chains": "A B C D E F G H", "excluded-chains": "I J K L", "alphabet": "RKHDEQNSTYWFAILMVGPC", "floor": False, "summary-stat": "mean", "tooltip-cols": {"sequential_site": "sequential site", "region": "region"}, "heatmap-limits": "-5, 0, 2"}
data_csv = "results/summaries/entry_293T-Mxra8_C636_293T-TIM1_Mxra8-binding.csv"
sitemap_csv = "data/pdb_sitemaps/6nk6_sitemap.csv"
dms_viz_json = "results/dms-viz/vlp_w_mMxra8_6nk6/vlp_w_mMxra8_6nk6.json"
pdb_file = "results/dms-viz/vlp_w_mMxra8_6nk6/vlp_w_mMxra8_6nk6.pdb"
input_data_csv = "results/dms-viz/vlp_w_mMxra8_6nk6/vlp_w_mMxra8_6nk6_data.csv"
input_sitemap_csv = "results/dms-viz/vlp_w_mMxra8_6nk6/vlp_w_mMxra8_6nk6_sitemap.csv"

Immediately below is the cell tagged parameters:

In [2]:
# this cell tagged `parameters` so that papermill fills parameter values

# input files
data_csv = None  # CSV w DMS data
sitemap_csv = None  # must have columns 'reference_site', 'sequential_site', 'protein_site', 'chains'

# input parameters
pdb_id = None  # Protein Data Bank ID, eg, 6nkg
pdb_type = None  # {"asymmetric_unit", "biological_assembly"}
name = None  # name assigned to visualization
melt_condition_metric_cols = None  # None, or list of columns to melt to create metric and condition
metric = None  # metric to plot
opt_params = None  # optional params, have same meaning as to `configure-dms-viz`:

# created files
pdb_file = None
dms_viz_json = None
input_data_csv = None
input_sitemap_csv = None
In [3]:
# Parameters
pdb_id = "6nk6"
pdb_type = "asymmetric_unit"
name = "effects of mutations to CHIKV E on binding to Mxra8"
melt_condition_metric_cols = ["binding to mouse Mxra8", "binding to human Mxra8"]
metric = "Mxra8 binding"
opt_params = {"condition": "Mxra8_type", "included-chains": "A B C D E F G H", "excluded-chains": "I J K L", "alphabet": "RKHDEQNSTYWFAILMVGPC", "floor": False, "summary-stat": "sum", "filter-cols": {"entry in 293T_Mxra8 cells": "293T-Mxra8 entry"}, "filter-limits": {"entry in 293T_Mxra8 cells": [-8, -4, 0]}, "tooltip-cols": {"sequential_site": "sequential site", "region": "region", "entry in 293T_Mxra8 cells": "293T-Mxra8 entry"}, "heatmap-limits": "-4, 0, 3"}
data_csv = "results/summaries/binding_mouse_vs_human_Mxra8.csv"
sitemap_csv = "data/pdb_sitemaps/6nk6_sitemap.csv"
dms_viz_json = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json"
pdb_file = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb"
input_data_csv = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv"
input_sitemap_csv = "results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv"

Read the data and site numbering¶

In [4]:
print(f"Reading data from {data_csv=}")

data = pd.read_csv(data_csv)

# cannot have spaces in filter-cols, so rename columns with underscore if needed
if "filter-cols" in opt_params:
    for col, name in list(opt_params["filter-cols"].items()):
        if " " in col:
            new_col = col.replace(" ", "_")
            print(f"{col=} has space, renaming to {new_col}")
            assert new_col not in data.columns, f"{new_col=}, {data.columns=}"
            data[new_col] = data[col]
            opt_params["filter-cols"][new_col] = name
            del opt_params["filter-cols"][col]
            if "filter-limits" in opt_params:
                opt_params["filter-limits"][new_col] = opt_params["filter-limits"][col]
                del opt_params["filter-limits"][col]
Reading data from data_csv='results/summaries/binding_mouse_vs_human_Mxra8.csv'
col='entry in 293T_Mxra8 cells' has space, renaming to entry_in_293T_Mxra8_cells
In [5]:
print(f"Reading the sitemap from {sitemap_csv=}")

sitemap_req_cols = ["reference_site", "sequential_site", "protein_site", "chains"]

sitemap = pd.read_csv(sitemap_csv, dtype={"protein_site": str}).assign(
    chains=lambda x: x["chains"].map(lambda c: [] if pd.isnull(c) else c.split())
)

if not set(sitemap.columns).issubset(sitemap_req_cols):
    raise ValueError(f"{sitemap_csv} has {sitemap.columns}, lacks {sitemap_req_cols=}")
sitemap = sitemap[sitemap_req_cols]

assert len(sitemap) == sitemap["reference_site"].nunique()
assert len(sitemap) == sitemap["sequential_site"].nunique()

print(f"\nComparing `site` in data to `reference_site` in sitemap:")
if sitemap["reference_site"].dtype != data["site"].dtype:
    raise ValueError(f'{sitemap["reference_site"].dtype=}, {data["site"].dtype=}')

nshared = len(set(sitemap["reference_site"]).intersection(data["site"]))
print(f"\n{nshared} sites in both data and sitemap")

sitemap_only = sitemap[~sitemap["reference_site"].isin(data["site"])].reset_index(drop=True)
print(f"\n{len(sitemap_only)} sites in sitemap but not data")
if len(sitemap_only):
    display(sitemap_only)

data_only = (
    data[~data["site"].isin(sitemap["reference_site"])]
    [["site"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"\n{len(data_only)} sites in data but not sitemap")
if len(data_only):
    display(data_only)

# add wildtype residues from data to sitemap
sitemap = sitemap.merge(
    data[["site", "wildtype"]].drop_duplicates().rename(
        columns={"site": "reference_site"}
    ),
    on="reference_site",
    validate="one_to_one",
    how="left",
)
Reading the sitemap from sitemap_csv='data/pdb_sitemaps/6nk6_sitemap.csv'

Comparing `site` in data to `reference_site` in sitemap:

988 sites in both data and sitemap

0 sites in sitemap but not data

0 sites in data but not sitemap

Get the PDB file and check against DMS data¶

In [6]:
print(f"Getting PDB file for {pdb_id=} to {pdb_file=} for {pdb_type=}")
os.makedirs(os.path.dirname(pdb_file), exist_ok=True)

if pdb_type == "biological_assembly":
    r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb1.gz")
    assert r.status_code == 200
    pdb_content = gzip.decompress(r.content).decode("utf-8")
elif pdb_type == "asymmetric_unit":
    r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb")
    assert r.status_code == 200
    pdb_content = r.content.decode("utf-8")
else:
    raise ValueError(f"invalid {pdb_type=}")

with open(pdb_file, "w") as f:
    f.write(pdb_content)
Getting PDB file for pdb_id='6nk6' to pdb_file='results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb' for pdb_type='asymmetric_unit'

Check the sites mismatched between the sitemap and the protein structure in terms of residue identity:

In [7]:
# read in the PDB object
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]
In [8]:
# analyze PDB object in context of other data / params

all_chains = [chain.id for chain in pdb_obj]
print(f"PDB has {all_chains=}")

if "included-chains" in opt_params:
    included_chains = opt_params["included-chains"].split()
    print(f"Included chains for mapping DMS data: {included_chains=}")
    assert set(included_chains).issubset(all_chains)
else:
    included_chains = all_chains
    print("Including all chains for mapping DMS data")

if "excluded-chains" in opt_params:
    if not set(opt_params["excluded-chains"].split()).issubset(all_chains):
        raise ValueError(f"{opt_params['excluded-chains']=} not in {all_chains=}")

sitemap_chains = set().union(*sitemap["chains"])
print(f"The chains specified in the sitemap are {sitemap_chains=}")
if sitemap_chains - set(included_chains):
    print(
        "Removing the following sitemap chains which are not in included-chains:\n"
        + str(sitemap_chains - set(included_chains))
    )
    sitemap = sitemap.assign(
        chains=lambda x: x["chains"].map(
            lambda cs: [c for c in cs if c in included_chains]
        )
    )
elif set(included_chains) - sitemap_chains:
    raise ValueError(f"some {included_chains=} not in {sitemap_chains=}")
sitemap = sitemap.assign(chains=lambda x: x["chains"].map(lambda c: " ".join(c)))

# get wildtype residue at each site
records = []
for chain in included_chains:
    for res in pdb_obj[chain].get_residues():
        if not res.id[0].isspace():
            continue
        aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]
        r = str(res.id[1])
        records.append((chain, r, aa))
pdb_df = pd.DataFrame(records, columns=["chain", "protein_site", "pdb_aa"])
assert pdb_df["protein_site"].dtype == sitemap["protein_site"].dtype

# compare wildtypes in DMS data and actual PDB
for chainset, chainset_df in sitemap.query("chains != ''").groupby("chains", dropna=True):
    chainset = chainset.split()
    print(f"\nComparing wildtype residues in DMS data and PDB for {chainset=}")
    pdb_chainset_df = (
        pdb_df
        .query("chain in @chainset")
        .groupby("protein_site", as_index=False)
        .aggregate(
            pdb_aa=pd.NamedAgg("pdb_aa", lambda s: " ".join(s.unique())),
            n_pdb_aas=pd.NamedAgg("pdb_aa", "nunique"),
        )
    )
    if (pdb_chainset_df["n_pdb_aas"] != 1).any():
        raise ValueError(
            "PDB does not have same amino-acid at all chains in {chainset=}:\n"
            + str(pdb_chainset_df.query("n_pdb_aas != 1"))
        )
    df = (
        chainset_df
        .merge(
            pdb_chainset_df.drop(columns="n_pdb_aas"),
            on="protein_site",
            how="outer",
            validate="one_to_one",
        )
        .assign(
            differ=lambda x: (
                (x["wildtype"] != x["pdb_aa"])
                & x["pdb_aa"].notnull()
                & x["wildtype"].notnull()
            )
        )
        .sort_values("sequential_site")
    )
    assert (df["wildtype"].notnull() | df["pdb_aa"].notnull()).all()
    print(
        "Comparing wildtype amino acids for sites with DMS data versus sites in PDB:\n"
        f"  - sites with same amino acid: {sum(df['wildtype'] == df['pdb_aa'])}\n"
        f"  - sites with different amino acid: {df['differ'].sum()}\n"
        f"  - sites missing in DMS data: {df['wildtype'].isnull().sum()}\n"
        f"  - sites missing in PDB: {df['pdb_aa'].isnull().sum()}"
    )
    if df["differ"].any():
        print("Here are sites with different amino acids between DMS data and PDB:")
        display(df.query("differ").drop(columns="differ").reset_index(drop=True))
PDB has all_chains=['M', 'N', 'O', 'P', 'A', 'E', 'I', 'B', 'F', 'J', 'C', 'G', 'K', 'D', 'H', 'L']
Included chains for mapping DMS data: included_chains=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
The chains specified in the sitemap are sitemap_chains={'H', 'F', 'G', 'B', 'C', 'D', 'A', 'E'}

Comparing wildtype residues in DMS data and PDB for chainset=['A', 'B', 'C', 'D']
Comparing wildtype amino acids for sites with DMS data versus sites in PDB:
  - sites with same amino acid: 423
  - sites with different amino acid: 16
  - sites missing in DMS data: 0
  - sites missing in PDB: 0
Here are sites with different amino acids between DMS data and PDB:
reference_site sequential_site protein_site chains wildtype pdb_aa
0 34(E1) 583 34 A B C D L Q
1 98(E1) 647 98 A B C D T A
2 142(E1) 691 142 A B C D V I
3 145(E1) 694 145 A B C D S A
4 162(E1) 711 162 A B C D I V
5 211(E1) 760 211 A B C D E K
6 225(E1) 774 225 A B C D S A
7 269(E1) 818 269 A B C D M V
8 276(E1) 825 276 A B C D M I
9 296(E1) 845 296 A B C D L V
10 321(E1) 870 321 A B C D A T
11 343(E1) 892 343 A B C D E D
12 344(E1) 893 344 A B C D I V
13 379(E1) 928 379 A B C D E A
14 404(E1) 953 404 A B C D V T
15 420(E1) 969 420 A B C D V I
Comparing wildtype residues in DMS data and PDB for chainset=['E', 'F', 'G', 'H']
Comparing wildtype amino acids for sites with DMS data versus sites in PDB:
  - sites with same amino acid: 389
  - sites with different amino acid: 30
  - sites missing in DMS data: 0
  - sites missing in PDB: 4
Here are sites with different amino acids between DMS data and PDB:
reference_site sequential_site protein_site chains wildtype pdb_aa
0 12(E2) 77 12 E F G H I T
1 32(E2) 97 32 E F G H V I
2 72(E2) 137 72 E F G H N S
3 74(E2) 139 74 E F G H M T
4 82(E2) 147 82 E F G H R G
5 84(E2) 149 84 E F G H F L
6 118(E2) 183 118 E F G H G S
7 124(E2) 189 124 E F G H S T
8 132(E2) 197 132 E F G H D E
9 140(E2) 205 140 E F G H K R
10 149(E2) 214 149 E F G H R K
11 157(E2) 222 157 E F G H A V
12 182(E2) 247 182 E F G H S T
13 194(E2) 259 194 E F G H S G
14 205(E2) 270 205 E F G H D G
15 222(E2) 287 222 E F G H V I
16 234(E2) 299 234 E F G H K N
17 255(E2) 320 255 E F G H V I
18 284(E2) 349 284 E F G H I T
19 302(E2) 367 302 E F G H E Q
20 307(E2) 372 307 E F G H Q H
21 317(E2) 382 317 E F G H I V
22 318(E2) 383 318 E F G H R T
23 342(E2) 407 342 E F G H L M
24 370(E2) 435 370 E F G H V I
25 384(E2) 449 384 E F G H V T
26 390(E2) 455 390 E F G H M V
27 415(E2) 480 415 E F G H I L
28 418(E2) 483 418 E F G H I V
29 421(E2) 486 421 E F G H A T

Run configure-dms-viz format¶

Run configure-dms-viz format, see here for the command line options.

First, prep the input data:

In [9]:
# allowed optional params from:
# https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/#configure-dms-viz-format
allowed_params = [
    "condition",
    "metric-name",
    "condition-name",
    "tooltip-cols",
    "filter-cols",
    "filter-limits",
    "heatmap-limits",
    "included-chains",
    "excluded-chains",
    "alphabet",
    "colors",
    "negative-colors",
    "exclude-amino-acids",
    "description",
    "title",
    "floor",
    "summary-stat",
]

if not set(opt_params).issubset(allowed_params):
    raise ValueError(f"invalid `opt_params` {set(opt_params) - set(allowed_params)=}")

# get required columns
data_cols = ["site", "wildtype", "mutant"]
for param in ["tooltip-cols", "filter-cols"]:
    if param in opt_params:
        data_cols += opt_params[param]
if not set(data_cols).issubset(data.columns):
    raise ValueError(f"{data_cols=} not in {data.columns=}")

# melt data if needed
if melt_condition_metric_cols:
    order_map = {c: i for (i, c) in enumerate(melt_condition_metric_cols)}
    if not set(melt_condition_metric_cols).issubset(data.columns):
        raise ValueError(f"{melt_condition_metric_cols} not in {data.columns=}")
    assert melt_condition_metric_cols, f"{melt_condition_metric_cols=}"
    if "condition" not in opt_params:
        raise ValueError(
            f"`melt_condition_metric_cols` requires 'condition' in {opt_params=}"
        )
    data_to_use = (
        data.melt(
            id_vars=data_cols,
            value_vars=melt_condition_metric_cols,
            var_name=opt_params["condition"],
            value_name=metric,
        )
        .sort_values(opt_params["condition"], key=lambda col: col.map(order_map))
    )
else:
    data_to_use = data
data_cols.append(metric)
if "condition" in opt_params:
    data_cols.append(opt_params["condition"])
if not set(data_cols).issubset(data_to_use.columns):
    raise ValueError(f"{data_cols=} not in {data_to_use.columns=}")

# write the sitemap dropping null protein_site columns
print(f"Writing sitemap with null protein sites dropped to {input_sitemap_csv=}")
sitemap.query("protein_site.notnull()").to_csv(input_sitemap_csv, index=False)

# write data to file to use as input, keeping only relevant columns with a protein site
print(f"Writing data for {data_cols=} to {input_data_csv=}")
(
    data_to_use
    [data_to_use["site"].isin(set(sitemap.query("protein_site.notnull()")["reference_site"]))]
    [data_cols]
    .to_csv(input_data_csv, index=False, float_format="%.4g")
)
Writing sitemap with null protein sites dropped to input_sitemap_csv='results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv'
Writing data for data_cols=['site', 'wildtype', 'mutant', 'sequential_site', 'region', 'entry in 293T_Mxra8 cells', 'entry_in_293T_Mxra8_cells', 'Mxra8 binding', 'Mxra8_type'] to input_data_csv='results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv'

Get the colors to use; if there are more than 4 we need to define more colors as configure-dms-viz only handles up to four by default:

In [10]:
def get_hex_color_palette(num_colors):
    colors = seaborn.color_palette("hls", num_colors)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    return hex_colors

if "condition" in opt_params:
    nconditions = data_to_use[opt_params["condition"]].nunique()
else:
    nconditions = 1

opt_params = opt_params.copy()

if "colors" not in opt_params:
    opt_params["colors"] = ",".join(get_hex_color_palette(nconditions))

Now assemble the commands and run the program:

In [11]:
assert set(opt_params).issubset(allowed_params)

cmds = [
    "configure-dms-viz",
    "format",
    "--name", name,
    "--input", input_data_csv,
    "--metric", metric,
    "--sitemap", input_sitemap_csv,
    "--structure", pdb_file,
    "--output", dms_viz_json,
]
for key, val in opt_params.items():
    cmds += [f"--{key}", str(val)]

print(f"Running the following cmds:\n{' '.join(cmds)}\n")
res = subprocess.run(cmds, capture_output=True, text=True)
res_str = []
for attr in ["returncode", "args", "stdout", "stderr"]:
    res_str.append(f"{attr}:\n{getattr(res, attr)}")
res_str = "\n\n".join(res_str)
if res.returncode != 0:
    raise RuntimeError(f"Command failed:\n\n{res_str}")
else:
    print(f"Command succeeded:\n\n{res_str}")
Running the following cmds:
configure-dms-viz format --name 293T-Mxra8 entry --input results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv --metric Mxra8 binding --sitemap results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv --structure results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb --output results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json --condition Mxra8_type --included-chains A B C D E F G H --excluded-chains I J K L --alphabet RKHDEQNSTYWFAILMVGPC --floor False --summary-stat sum --filter-cols {'entry_in_293T_Mxra8_cells': '293T-Mxra8 entry'} --filter-limits {'entry_in_293T_Mxra8_cells': [-8, -4, 0]} --tooltip-cols {'sequential_site': 'sequential site', 'region': 'region', 'entry in 293T_Mxra8 cells': '293T-Mxra8 entry'} --heatmap-limits -4, 0, 3 --colors #db5f57,#57d3db

Command succeeded:

returncode:
0

args:
['configure-dms-viz', 'format', '--name', '293T-Mxra8 entry', '--input', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv', '--metric', 'Mxra8 binding', '--sitemap', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv', '--structure', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.pdb', '--output', 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json', '--condition', 'Mxra8_type', '--included-chains', 'A B C D E F G H', '--excluded-chains', 'I J K L', '--alphabet', 'RKHDEQNSTYWFAILMVGPC', '--floor', 'False', '--summary-stat', 'sum', '--filter-cols', "{'entry_in_293T_Mxra8_cells': '293T-Mxra8 entry'}", '--filter-limits', "{'entry_in_293T_Mxra8_cells': [-8, -4, 0]}", '--tooltip-cols', "{'sequential_site': 'sequential site', 'region': 'region', 'entry in 293T_Mxra8 cells': '293T-Mxra8 entry'}", '--heatmap-limits', '-4, 0, 3', '--colors', '#db5f57,#57d3db']

stdout:

Formatting data for visualization using the 'Mxra8 binding' column from 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_data.csv'...

Using sitemap from 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit_sitemap.csv'.

Warning: NaN values were found in the metric column. These rows will be filtered out.

Warning: There are sites where there are no mutations, in other words, only the wildtype residue is present in the mutation column for that site. These rows will be filtered out.
Three values were provided for heatmap limits, these will be the min, center, and max of the scale.

Warning: The 'entry_in_293T_Mxra8_cells' filter limit '-8' is less than the minimum value of -7.712. Setting the min value to -7.712.

About 94.56% (799 of 845) of the wildtype residues in the data match the corresponding residues in the structure.
About 0.47% (4 of 849) of the data sites are missing from the structure.

Success! The visualization JSON was written to 'results/dms-viz/mxra8_binding_on_6nk6_asymmetric_unit/mxra8_binding_on_6nk6_asymmetric_unit.json'


stderr: