######## snakemake preamble start (automatically inserted, do not edit) ########
import sys;sys.path.extend(['/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/dms-vep-pipeline-3/..', '/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/dms-vep-pipeline-3', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/bin', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11/lib-dynload', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/home/ccarr/.cache/snakemake/snakemake/source-cache/runtime-cache/tmp6zmc2gqj/file/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/notebooks', '/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/notebooks']);import pickle;from snakemake import script;script.snakemake = pickle.loads(b'\x80\x04\x95\xcd\n\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8cGresults/summaries/all_antibodies_and_cell_entry_per_antibody_escape.csv\x94\x8c3results/summaries/all_antibodies_and_cell_entry.csv\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\nescape_csv\x94K\x00N\x86\x94\x8c\x0ephenotypes_csv\x94K\x01N\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh\x15h\x06\x8c\x0eAttributeGuard\x94\x93\x94)\x81\x94}\x94\x8c\x04name\x94h\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbh\x0fh\nh\x11h\x0bub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94\x8c\x14results/escape_logos\x94a}\x94(h\r}\x94\x8c\x0flogoplot_subdir\x94K\x00N\x86\x94sh\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbh%h"ub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbub\x8c\x07threads\x94K\x01\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x01K\x01\x8c\x14/loc/scratch/2402969\x94e}\x94(h\r}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbhJK\x01hLK\x01hNhGub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c$results/notebooks/escape_logos.ipynb\x94a}\x94(h\r}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbh\\hYub\x8c\x06config\x94}\x94(\x8c\rpipeline_path\x94\x8c\x12dms-vep-pipeline-3\x94\x8c\x04docs\x94\x8c\x04docs\x94\x8c\x0fgithub_repo_url\x94\x8c-https://github.com/dms-vep/RABV_Pasteur_G_DMS\x94\x8c\x0fgithub_blob_url\x94\x8c7https://github.com/dms-vep/RABV_Pasteur_G_DMS/blob/main\x94\x8c\x0bdescription\x94\x8ceDeep mutational scanning using Rabies glycoprotein (strain Pasteur). Analysis uses dms-vep-pipeline-3\x94\x8c\x04year\x94M\xe8\x07\x8c\x07authors\x94\x8c\x1dArjun Aditham and Jesse Bloom\x94\x8c\x12site_numbering_map\x94\x8c\x1bdata/site_numbering_map.csv\x94\x8c\x1emutation_design_classification\x94}\x94(\x8c\x03csv\x94\x8c\x1bdata/designed_mutations.csv\x94\x8c\x08site_col\x94\x8c\x0fsequential_site\x94u\x8c\x16neut_standard_barcodes\x94\x8c)data/neutralization_standard_barcodes.csv\x94\x8c\x11prebuilt_variants\x94N\x8c\x10prebuilt_geneseq\x94N\x8c\x0bpacbio_runs\x94\x8c\x14data/PacBio_runs.csv\x94\x8c\x0fpacbio_amplicon\x94\x8c\x17data/PacBio_amplicon.gb\x94\x8c\x15pacbio_amplicon_specs\x94\x8c$data/PacBio_feature_parse_specs.yaml\x94\x8c\x0cvariant_tags\x94}\x94(\x8c\x0cvariant_tag5\x94}\x94(\x8c\tvariant_1\x94\x8c\x01G\x94\x8c\tvariant_2\x94\x8c\x01C\x94\x8c\x08wildtype\x94\x8c\x01A\x94u\x8c\x0cvariant_tag3\x94}\x94(\x8c\tvariant_1\x94h\x89\x8c\tvariant_2\x94h\x8b\x8c\x08wildtype\x94h\x8duu\x8c\x12max_ccs_error_rate\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x10consensus_params\x94}\x94(\x8c\rmax_sub_diffs\x94N\x8c\x0fmax_indel_diffs\x94N\x8c\x12max_minor_sub_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x14max_minor_indel_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x0bmin_support\x94K\x03u\x8c\x13gene_sequence_codon\x94\x8c\x1edata/gene_sequence/codon.fasta\x94\x8c\x15gene_sequence_protein\x94\x8c data/gene_sequence/protein.fasta\x94\x8c\x0ecodon_variants\x94\x8c#results/variants/codon_variants.csv\x94\x8c\x0cbarcode_runs\x94\x8c\x15data/barcode_runs.csv\x94\x8c\x12duplicate_fastq_R1\x94\x8c\x04warn\x94\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\x08upstream\x94\x8c"ACTCCACTAGGAACATTTCTCTCTCGAATCTAGA\x94\x8c\ndownstream\x94\x8c\x00\x94\x8c\x04minq\x94K\x14\x8c\x11upstream_mismatch\x94K\x02u\x8c\x13func_effects_config\x94\x8c\x1cdata/func_effects_config.yml\x94\x8c\x16antibody_escape_config\x94\x8c\x1fdata/antibody_escape_config.yml\x94\x8c\x10summaries_config\x94\x8c\x19data/summaries_config.yml\x94\x8c\x0edms_viz_config\x94\x8c\x17data/dms_viz_config.yml\x94u\x8c\x04rule\x94\x8c\x0cescape_logos\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cP/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/notebooks\x94ub.');del script;from snakemake.logging import logger;from snakemake.script import snakemake; logger.printshellcmds = False;import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS');
######## snakemake preamble end #########
Make logoplots showing key sites of escape for each antibody¶
Get variables from snakemake
:
escape_csv = snakemake.input.escape_csv
phenotypes_csv = snakemake.input.phenotypes_csv
logoplot_subdir = snakemake.output.logoplot_subdir
Import Python modules:
import functools
import itertools
import operator
import os
import altair as alt
import dmslogo
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
_ = alt.data_transformers.disable_max_rows()
os.makedirs(logoplot_subdir, exist_ok=True)
Some configuration options. Set configuration here, do not hardcode it in the notebook in later cells.
positive_escape_only = True # only show positive escape values
min_cell_entry = -5 # minimum cell-entry effects worse than this are set to this
drop_low_cell_entry = True # drop escape for any mutations with cell entry below this?
# highlight sites that meet any of these criteria (an OR operation)
highlight_params = {
"n_site_escape": 6, # the top this many sites in terms of site_escape
"n_top_mut_escape": 6, # the top this many sites in terms of top site escape
"min_site_escape": 12, # site escape >= this
"min_top_mut_escape": 2.5, # top mutation escape >= this
}
# amino acids to plot
aas = list(dmslogo.colorschemes.AA_CHARGE)
# line plot xtick locations
lineplot_xticks = [0, 100, 200, 300, 400]
# make logo plots with each of these antibody groups in addition to per-antibody logos
antibody_groups = {
"region_3_antibodies": ["17C7", "CR4098", "RVA122", "RVC58"],
"region_1_antibodies": ["RVC20", "CR57"],
}
# sites to exclude from highlighting, do not show these sites even if they
# meet highlight criteria. Key by antibody with value lists of sites
sites_no_highlight = {
}
# file extensions on saved plots
file_extensions = [".svg", ".pdf"]
Read escape values. Mutations with missing escape (or that have cell entry below threshold if dropping those) is set to zero. Setting these to zero is the same as removing them for the logoplots and site-escape sum plots used here, but if you use other plots you should bae aware that missing mutations have escape of zero in this dataframe.
escape = (
pd.read_csv(escape_csv)
[["antibody", "site", "wildtype", "mutant", "escape"]]
.merge(
pd.read_csv(phenotypes_csv)[["site", "sequential_site", "wildtype", "mutant", "cell entry"]],
on=["site", "wildtype", "mutant"],
how="left",
validate="many_to_one",
)
.query("mutant in @aas")
)
if drop_low_cell_entry:
print(f"Setting to zero escape for mutations with cell entry values below {min_cell_entry=}")
escape["escape"] = escape["escape"].where(escape["cell entry"] >= min_cell_entry, 0)
print(f"Flooring cell entry values to {min_cell_entry=}")
escape["cell entry"] = escape["cell entry"].clip(lower=min_cell_entry)
if positive_escape_only:
print("Setting negative escape values to zero")
escape["escape"] = escape["escape"].clip(lower=0)
# pad missing escape values to zero
antibodies = escape["antibody"].unique().tolist()
print(f"Read escape for the following {len(antibodies)}:\n{antibodies=}")
escape_fill_zero = (
pd.DataFrame(
[[*tup, 0] for tup in itertools.product(escape["sequential_site"].unique(), aas, antibodies)],
columns=["sequential_site", "mutant", "antibody", "escape"]
)
.merge(
escape[["sequential_site", "site", "wildtype"]].drop_duplicates(),
on="sequential_site",
validate="many_to_one",
)
)
escape = (
escape
[["sequential_site", "mutant", "antibody", "cell entry", "escape"]]
.merge(
escape_fill_zero,
on=["sequential_site", "mutant", "antibody"],
how="outer",
validate="one_to_one",
)
.assign(
**{
"cell entry": lambda x: x["cell entry"].fillna(0),
"escape": lambda x: x["escape_x"].where(x["escape_x"].notnull(), x["escape_y"]),
}
)
.drop(columns=["escape_x", "escape_y"])
)
Setting to zero escape for mutations with cell entry values below min_cell_entry=-5 Flooring cell entry values to min_cell_entry=-5 Setting negative escape values to zero Read escape for the following 8: antibodies=['17C7', 'CR4098', 'CR57', 'CTB012', 'RVA122', 'RVC20', 'RVC58', 'RVC68']
To choose which sites to highlight, we get the total magnitude of the site escape and largest magnitude mutation at each site for each antibody. Then plot to indicate which sites are being shown:
# get total and top magnitude escape at each site
per_site_escape = (
escape
.groupby(["antibody", "site"], as_index=False)
.aggregate(
site_escape=pd.NamedAgg("escape", lambda s: s.abs().sum()),
top_mut_escape=pd.NamedAgg("escape", lambda s: s.abs().max()),
)
.melt(id_vars=["antibody", "site"], var_name="escape_type", value_name="escape")
.assign(
rank=lambda x: (
x.groupby(["antibody", "escape_type"])["escape"].rank(
method="min", ascending=False
).astype(int)
)
)
.merge(
(
pd.DataFrame(sites_no_highlight.items(), columns=["antibody", "site"])
.explode("site")
.assign(manual_no_highlight=True)
),
on=["antibody", "site"],
validate="many_to_one",
how="left",
)
.assign(manual_no_highlight=lambda x: x["manual_no_highlight"].fillna(False))
)
/loc/scratch/2402969/ipykernel_29054/1397458998.py:27: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` .assign(manual_no_highlight=lambda x: x["manual_no_highlight"].fillna(False))
# indicate the sites to highlight
per_site_escape["highlight"] = ~per_site_escape["manual_no_highlight"] & functools.reduce(
operator.or_,
[
(
(per_site_escape["escape_type"] == escape_type)
& (
(per_site_escape["escape"] >= highlight_params[f"min_{escape_type}"])
| (per_site_escape["rank"] <= highlight_params[f"n_{escape_type}"])
)
)
for escape_type in ["site_escape", "top_mut_escape"]
]
)
print("Number of sites to highlight for each antibody:")
per_site_escape.query("highlight").groupby("antibody").aggregate(
n_sites_to_highlight=pd.NamedAgg("site", "nunique")
)
Number of sites to highlight for each antibody:
n_sites_to_highlight | |
---|---|
antibody | |
17C7 | 7 |
CR4098 | 8 |
CR57 | 7 |
CTB012 | 24 |
RVA122 | 8 |
RVC20 | 7 |
RVC58 | 7 |
RVC68 | 6 |
# make plot
site_selection = alt.selection_point(fields=["site"], on="mouseover", empty=False)
per_site_escape_chart = (
alt.Chart(per_site_escape)
.add_params(site_selection)
.transform_calculate(jitter="random() - 0.5")
.encode(
alt.X("escape", title=None),
alt.Y("antibody"),
alt.YOffset("jitter:Q", scale=alt.Scale(domain=[-1.2, 1.2])),
alt.Column(
"escape_type",
title=None,
header=alt.Header(orient="bottom", labelFontSize=11, labelFontStyle="bold"),
),
alt.Color("highlight", title="site to highlight"),
tooltip=["site", "rank", "antibody", "manual_no_highlight"],
strokeWidth=alt.condition(site_selection, alt.value(3), alt.value(0.25)),
)
.mark_circle(size=40, strokeOpacity=1, fillOpacity=0.4, stroke="black")
.resolve_scale(x="independent")
.properties(width=275, height=alt.Step(30))
.configure_axis(grid=False)
)
per_site_escape_chart
Add the sites to highlight for each antibody to the escape data frame, and get the total positive and negative escape at each site:
escape = (
escape
.assign(
positive_site_escape=(
lambda x: (
x.groupby(["antibody", "site"])["escape"].transform(lambda s: s[s > 0].sum())
)
),
negative_site_escape=(
lambda x: (
x.groupby(["antibody", "site"])["escape"].transform(lambda s: s[s < 0].sum())
)
),
)
.drop(columns="highlight", errors="ignore")
.merge(
per_site_escape.groupby(["antibody", "site"], as_index=False).aggregate(
{"highlight": "any"}
),
on=["antibody", "site"],
validate="many_to_one",
)
)
Now make line and logo plots for each antibody.
First make color scale to color logos by cell entry:
colormap = dmslogo.colorschemes.ValueToColorMap(
minvalue=min_cell_entry,
maxvalue=0,
cmap=matplotlib.colors.LinearSegmentedColormap.from_list(
"white_to_green",
[(1,0.985,0.737), (0.0545, 0.4313, 0.054)],
),
)
for orientation in ["vertical", "horizontal"]:
assert colormap.minvalue == int(colormap.minvalue), "code requires integer minvalue for color scale"
assert colormap.maxvalue == int(colormap.maxvalue), "code requires integer maxvalue for color scale"
scale_fig, scale_ax = colormap.scale_bar(
orientation=orientation,
label="effect on cell entry",
)
ticks = list(range(int(colormap.minvalue), int(colormap.maxvalue) + 1))
ticklabels = [f"≤{ticks[0]}"] + [str(t) for t in ticks[1: -1]] + [f"≥{ticks[-1]}"]
ax = scale_ax.xaxis if orientation == "horizontal" else scale_ax.yaxis
ax.set_ticks(ticks)
ax.set_ticklabels(ticklabels)
display(scale_fig)
for ext in file_extensions:
fname = os.path.join(logoplot_subdir, f"scalebar_{orientation}{ext}")
print(f"Saving to {fname}")
scale_fig.savefig(fname)
plt.close(scale_fig)
escape["letter_color"] = colormap.val_to_color(
escape["cell entry"].clip(lower=colormap.minvalue, upper=colormap.maxvalue)
)
Saving to results/escape_logos/scalebar_vertical.svg Saving to results/escape_logos/scalebar_vertical.pdf
Saving to results/escape_logos/scalebar_horizontal.svg Saving to results/escape_logos/scalebar_horizontal.pdf
Draw plots for each individual antibody:
for antibody_group, antibodies_to_plot in (
antibody_groups | {a: [a] for a in antibodies}
).items():
print(f"\nMaking plot for {antibody_group=} with {antibodies_to_plot=}")
df = (
escape
.query("antibody in @antibodies_to_plot")
.assign(highlight=lambda x: x.groupby("site")["highlight"].transform("any"))
)
fig, axes = dmslogo.facet_plot(
df,
x_col="sequential_site",
show_col="highlight",
gridrow_col="antibody",
draw_line_kwargs={
"height_col": "positive_site_escape",
"height_col2": None if positive_escape_only else "negative_site_escape",
"xtick_col": "site",
"ylabel": "escape",
"widthscale": 0.3,
},
draw_logo_kwargs={
"letter_col": "mutant",
"letter_height_col": "escape",
"xtick_col": "site",
"color_col": "letter_color",
"widthscale": 0.7,
},
share_xlabel=True,
share_ylabel=True,
height_per_ax=1.9,
wspace=0.6,
share_ylim_across_rows=False,
)
# adjust x-ticks on line plots
assert axes.shape[1] == 2
for i in range(axes.shape[0]):
axes[i, 0].xaxis.set_ticks(lineplot_xticks)
if i == axes.shape[0] - 1:
axes[i, 0].xaxis.set_ticklabels(lineplot_xticks)
for ext in file_extensions:
fname = os.path.join(logoplot_subdir, f"{antibody_group}{ext}")
print(f"Saving to {fname}")
fig.savefig(fname)
display(fig)
plt.close(fig)
Making plot for antibody_group='region_3_antibodies' with antibodies_to_plot=['17C7', 'CR4098', 'RVA122', 'RVC58']
Saving to results/escape_logos/region_3_antibodies.svg
Saving to results/escape_logos/region_3_antibodies.pdf
Making plot for antibody_group='region_1_antibodies' with antibodies_to_plot=['RVC20', 'CR57']
Saving to results/escape_logos/region_1_antibodies.svg
Saving to results/escape_logos/region_1_antibodies.pdf
Making plot for antibody_group='17C7' with antibodies_to_plot=['17C7']
Saving to results/escape_logos/17C7.svg Saving to results/escape_logos/17C7.pdf
Making plot for antibody_group='CR4098' with antibodies_to_plot=['CR4098']
Saving to results/escape_logos/CR4098.svg Saving to results/escape_logos/CR4098.pdf
Making plot for antibody_group='CR57' with antibodies_to_plot=['CR57']
Saving to results/escape_logos/CR57.svg Saving to results/escape_logos/CR57.pdf
Making plot for antibody_group='CTB012' with antibodies_to_plot=['CTB012']
Saving to results/escape_logos/CTB012.svg
Saving to results/escape_logos/CTB012.pdf
Making plot for antibody_group='RVA122' with antibodies_to_plot=['RVA122']
Saving to results/escape_logos/RVA122.svg Saving to results/escape_logos/RVA122.pdf
Making plot for antibody_group='RVC20' with antibodies_to_plot=['RVC20']
Saving to results/escape_logos/RVC20.svg Saving to results/escape_logos/RVC20.pdf
Making plot for antibody_group='RVC58' with antibodies_to_plot=['RVC58']
Saving to results/escape_logos/RVC58.svg Saving to results/escape_logos/RVC58.pdf
Making plot for antibody_group='RVC68' with antibodies_to_plot=['RVC68']
Saving to results/escape_logos/RVC68.svg Saving to results/escape_logos/RVC68.pdf