######## snakemake preamble start (automatically inserted, do not edit) ########
import sys;sys.path.extend(['/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/dms-vep-pipeline-3/..', '/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/dms-vep-pipeline-3', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/bin', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11/lib-dynload', '/home/ccarr/miniforge3/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/home/ccarr/.cache/snakemake/snakemake/source-cache/runtime-cache/tmp6zmc2gqj/file/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/notebooks', '/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/notebooks']);import pickle;from snakemake import script;script.snakemake = pickle.loads(b'\x80\x04\x95\xcd\n\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8cGresults/summaries/all_antibodies_and_cell_entry_per_antibody_escape.csv\x94\x8c3results/summaries/all_antibodies_and_cell_entry.csv\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\nescape_csv\x94K\x00N\x86\x94\x8c\x0ephenotypes_csv\x94K\x01N\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh\x15h\x06\x8c\x0eAttributeGuard\x94\x93\x94)\x81\x94}\x94\x8c\x04name\x94h\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbh\x0fh\nh\x11h\x0bub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94\x8c\x14results/escape_logos\x94a}\x94(h\r}\x94\x8c\x0flogoplot_subdir\x94K\x00N\x86\x94sh\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbh%h"ub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbub\x8c\x07threads\x94K\x01\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x01K\x01\x8c\x14/loc/scratch/2402969\x94e}\x94(h\r}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbhJK\x01hLK\x01hNhGub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c$results/notebooks/escape_logos.ipynb\x94a}\x94(h\r}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh\x13]\x94(h\x15h\x16eh\x15h\x18)\x81\x94}\x94h\x1bh\x15sbh\x16h\x18)\x81\x94}\x94h\x1bh\x16sbh\\hYub\x8c\x06config\x94}\x94(\x8c\rpipeline_path\x94\x8c\x12dms-vep-pipeline-3\x94\x8c\x04docs\x94\x8c\x04docs\x94\x8c\x0fgithub_repo_url\x94\x8c-https://github.com/dms-vep/RABV_Pasteur_G_DMS\x94\x8c\x0fgithub_blob_url\x94\x8c7https://github.com/dms-vep/RABV_Pasteur_G_DMS/blob/main\x94\x8c\x0bdescription\x94\x8ceDeep mutational scanning using Rabies glycoprotein (strain Pasteur). Analysis uses dms-vep-pipeline-3\x94\x8c\x04year\x94M\xe8\x07\x8c\x07authors\x94\x8c\x1dArjun Aditham and Jesse Bloom\x94\x8c\x12site_numbering_map\x94\x8c\x1bdata/site_numbering_map.csv\x94\x8c\x1emutation_design_classification\x94}\x94(\x8c\x03csv\x94\x8c\x1bdata/designed_mutations.csv\x94\x8c\x08site_col\x94\x8c\x0fsequential_site\x94u\x8c\x16neut_standard_barcodes\x94\x8c)data/neutralization_standard_barcodes.csv\x94\x8c\x11prebuilt_variants\x94N\x8c\x10prebuilt_geneseq\x94N\x8c\x0bpacbio_runs\x94\x8c\x14data/PacBio_runs.csv\x94\x8c\x0fpacbio_amplicon\x94\x8c\x17data/PacBio_amplicon.gb\x94\x8c\x15pacbio_amplicon_specs\x94\x8c$data/PacBio_feature_parse_specs.yaml\x94\x8c\x0cvariant_tags\x94}\x94(\x8c\x0cvariant_tag5\x94}\x94(\x8c\tvariant_1\x94\x8c\x01G\x94\x8c\tvariant_2\x94\x8c\x01C\x94\x8c\x08wildtype\x94\x8c\x01A\x94u\x8c\x0cvariant_tag3\x94}\x94(\x8c\tvariant_1\x94h\x89\x8c\tvariant_2\x94h\x8b\x8c\x08wildtype\x94h\x8duu\x8c\x12max_ccs_error_rate\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x10consensus_params\x94}\x94(\x8c\rmax_sub_diffs\x94N\x8c\x0fmax_indel_diffs\x94N\x8c\x12max_minor_sub_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x14max_minor_indel_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x0bmin_support\x94K\x03u\x8c\x13gene_sequence_codon\x94\x8c\x1edata/gene_sequence/codon.fasta\x94\x8c\x15gene_sequence_protein\x94\x8c data/gene_sequence/protein.fasta\x94\x8c\x0ecodon_variants\x94\x8c#results/variants/codon_variants.csv\x94\x8c\x0cbarcode_runs\x94\x8c\x15data/barcode_runs.csv\x94\x8c\x12duplicate_fastq_R1\x94\x8c\x04warn\x94\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\x08upstream\x94\x8c"ACTCCACTAGGAACATTTCTCTCTCGAATCTAGA\x94\x8c\ndownstream\x94\x8c\x00\x94\x8c\x04minq\x94K\x14\x8c\x11upstream_mismatch\x94K\x02u\x8c\x13func_effects_config\x94\x8c\x1cdata/func_effects_config.yml\x94\x8c\x16antibody_escape_config\x94\x8c\x1fdata/antibody_escape_config.yml\x94\x8c\x10summaries_config\x94\x8c\x19data/summaries_config.yml\x94\x8c\x0edms_viz_config\x94\x8c\x17data/dms_viz_config.yml\x94u\x8c\x04rule\x94\x8c\x0cescape_logos\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cP/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS/notebooks\x94ub.');del script;from snakemake.logging import logger;from snakemake.script import snakemake; logger.printshellcmds = False;import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/ccarr/2024/RABV_Pasteur_G_DMS');
######## snakemake preamble end #########

escape_csv = snakemake.input.escape_csv
phenotypes_csv = snakemake.input.phenotypes_csv

logoplot_subdir = snakemake.output.logoplot_subdir

import functools
import itertools
import operator
import os

import altair as alt

import dmslogo

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

_ = alt.data_transformers.disable_max_rows()

os.makedirs(logoplot_subdir, exist_ok=True)

positive_escape_only = True  # only show positive escape values
min_cell_entry = -5  # minimum cell-entry effects worse than this are set to this
drop_low_cell_entry = True  # drop escape for any mutations with cell entry below this?

# highlight sites that meet any of these criteria (an OR operation)
highlight_params = {
    "n_site_escape": 6,  # the top this many sites in terms of site_escape
    "n_top_mut_escape": 6,  # the top this many sites in terms of top site escape
    "min_site_escape": 12,  # site escape >= this
    "min_top_mut_escape": 2.5,  # top mutation escape >= this
}

# amino acids to plot
aas = list(dmslogo.colorschemes.AA_CHARGE)

# line plot xtick locations
lineplot_xticks = [0, 100, 200, 300, 400]

# make logo plots with each of these antibody groups in addition to per-antibody logos
antibody_groups = {
    "region_3_antibodies": ["17C7", "CR4098", "RVA122", "RVC58"],
    "region_1_antibodies": ["RVC20", "CR57"],   
}

# sites to exclude from highlighting, do not show these sites even if they
# meet highlight criteria. Key by antibody with value lists of sites
sites_no_highlight = {
}

# file extensions on saved plots
file_extensions = [".svg", ".pdf"]

escape = (
    pd.read_csv(escape_csv)
    [["antibody", "site", "wildtype", "mutant", "escape"]]
    .merge(
        pd.read_csv(phenotypes_csv)[["site", "sequential_site", "wildtype", "mutant", "cell entry"]],
        on=["site", "wildtype", "mutant"],
        how="left",
        validate="many_to_one",
    )
    .query("mutant in @aas")
)

if drop_low_cell_entry:
    print(f"Setting to zero escape for mutations with cell entry values below {min_cell_entry=}")
    escape["escape"] = escape["escape"].where(escape["cell entry"] >= min_cell_entry, 0)
    
print(f"Flooring cell entry values to {min_cell_entry=}")
escape["cell entry"] = escape["cell entry"].clip(lower=min_cell_entry)

if positive_escape_only:
    print("Setting negative escape values to zero")
    escape["escape"] = escape["escape"].clip(lower=0)

# pad missing escape values to zero
antibodies = escape["antibody"].unique().tolist()
print(f"Read escape for the following {len(antibodies)}:\n{antibodies=}")
escape_fill_zero = (
    pd.DataFrame(
        [[*tup, 0] for tup in itertools.product(escape["sequential_site"].unique(), aas, antibodies)],
        columns=["sequential_site", "mutant", "antibody", "escape"]
    )
    .merge(
        escape[["sequential_site", "site", "wildtype"]].drop_duplicates(),
        on="sequential_site",
        validate="many_to_one",
    )
)
escape = (
    escape
    [["sequential_site", "mutant", "antibody", "cell entry", "escape"]]
    .merge(
        escape_fill_zero,
        on=["sequential_site", "mutant", "antibody"],
        how="outer",
        validate="one_to_one",
    )
    .assign(
        **{
            "cell entry": lambda x: x["cell entry"].fillna(0),
            "escape": lambda x: x["escape_x"].where(x["escape_x"].notnull(), x["escape_y"]),
        }
    )
    .drop(columns=["escape_x", "escape_y"])
)

Setting to zero escape for mutations with cell entry values below min_cell_entry=-5
Flooring cell entry values to min_cell_entry=-5
Setting negative escape values to zero
Read escape for the following 8:
antibodies=['17C7', 'CR4098', 'CR57', 'CTB012', 'RVA122', 'RVC20', 'RVC58', 'RVC68']

# get total and top magnitude escape at each site
per_site_escape = (
    escape
    .groupby(["antibody", "site"], as_index=False)
    .aggregate(
        site_escape=pd.NamedAgg("escape", lambda s: s.abs().sum()),
        top_mut_escape=pd.NamedAgg("escape", lambda s: s.abs().max()),
    )
    .melt(id_vars=["antibody", "site"], var_name="escape_type", value_name="escape")
    .assign(
        rank=lambda x: (
            x.groupby(["antibody", "escape_type"])["escape"].rank(
                method="min", ascending=False
            ).astype(int)
        )
    )
    .merge(
        (
            pd.DataFrame(sites_no_highlight.items(), columns=["antibody", "site"])
            .explode("site")
            .assign(manual_no_highlight=True)
        ),
        on=["antibody", "site"],
        validate="many_to_one",
        how="left",
    )
    .assign(manual_no_highlight=lambda x: x["manual_no_highlight"].fillna(False))
)

/loc/scratch/2402969/ipykernel_29054/1397458998.py:27: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  .assign(manual_no_highlight=lambda x: x["manual_no_highlight"].fillna(False))

# indicate the sites to highlight
per_site_escape["highlight"] = ~per_site_escape["manual_no_highlight"] & functools.reduce(
    operator.or_,
    [
        (
            (per_site_escape["escape_type"] == escape_type)
            & (
                (per_site_escape["escape"] >= highlight_params[f"min_{escape_type}"])
                | (per_site_escape["rank"] <= highlight_params[f"n_{escape_type}"])
            )
        )
        for escape_type in ["site_escape", "top_mut_escape"]
    ]
)

print("Number of sites to highlight for each antibody:")
per_site_escape.query("highlight").groupby("antibody").aggregate(
    n_sites_to_highlight=pd.NamedAgg("site", "nunique")
)

Number of sites to highlight for each antibody:

# make plot
site_selection = alt.selection_point(fields=["site"], on="mouseover", empty=False) 

per_site_escape_chart = (
    alt.Chart(per_site_escape)
    .add_params(site_selection)
    .transform_calculate(jitter="random() - 0.5")
    .encode(
        alt.X("escape", title=None),
        alt.Y("antibody"),
        alt.YOffset("jitter:Q", scale=alt.Scale(domain=[-1.2, 1.2])),
        alt.Column(
            "escape_type",
            title=None,
            header=alt.Header(orient="bottom", labelFontSize=11, labelFontStyle="bold"),
        ),
        alt.Color("highlight", title="site to highlight"),
        tooltip=["site", "rank", "antibody", "manual_no_highlight"],
        strokeWidth=alt.condition(site_selection, alt.value(3), alt.value(0.25)),
    )
    .mark_circle(size=40, strokeOpacity=1, fillOpacity=0.4, stroke="black")
    .resolve_scale(x="independent")
    .properties(width=275, height=alt.Step(30))
    .configure_axis(grid=False)
)

per_site_escape_chart

escape = (
    escape
    .assign(
        positive_site_escape=(
            lambda x: (
                x.groupby(["antibody", "site"])["escape"].transform(lambda s: s[s > 0].sum())
    
            )
        ),
        negative_site_escape=(
            lambda x: (
                x.groupby(["antibody", "site"])["escape"].transform(lambda s: s[s < 0].sum())
    
            )
        ),
    )
    .drop(columns="highlight", errors="ignore")
    .merge(
        per_site_escape.groupby(["antibody", "site"], as_index=False).aggregate(
            {"highlight": "any"}
        ),
        on=["antibody", "site"],
        validate="many_to_one",
    )
)

colormap = dmslogo.colorschemes.ValueToColorMap(
    minvalue=min_cell_entry,
    maxvalue=0,
    cmap=matplotlib.colors.LinearSegmentedColormap.from_list(
        "white_to_green",
        [(1,0.985,0.737), (0.0545, 0.4313, 0.054)],
    ),
)

for orientation in ["vertical", "horizontal"]:
    assert colormap.minvalue == int(colormap.minvalue), "code requires integer minvalue for color scale"
    assert colormap.maxvalue == int(colormap.maxvalue), "code requires integer maxvalue for color scale"
    scale_fig, scale_ax = colormap.scale_bar(
        orientation=orientation,
        label="effect on cell entry",
    )
    ticks = list(range(int(colormap.minvalue), int(colormap.maxvalue) + 1))
    ticklabels = [f"≤{ticks[0]}"] + [str(t) for t in ticks[1: -1]] + [f"≥{ticks[-1]}"]
    ax = scale_ax.xaxis if orientation == "horizontal" else scale_ax.yaxis
    ax.set_ticks(ticks)
    ax.set_ticklabels(ticklabels)
    display(scale_fig)

    for ext in file_extensions:
        fname = os.path.join(logoplot_subdir, f"scalebar_{orientation}{ext}")
        print(f"Saving to {fname}")
        scale_fig.savefig(fname)
    
    plt.close(scale_fig)

escape["letter_color"] = colormap.val_to_color(
    escape["cell entry"].clip(lower=colormap.minvalue, upper=colormap.maxvalue)
)

Saving to results/escape_logos/scalebar_vertical.svg
Saving to results/escape_logos/scalebar_vertical.pdf

Saving to results/escape_logos/scalebar_horizontal.svg
Saving to results/escape_logos/scalebar_horizontal.pdf

for antibody_group, antibodies_to_plot in (
    antibody_groups | {a: [a] for a in antibodies}
).items():
    print(f"\nMaking plot for {antibody_group=} with {antibodies_to_plot=}")
    df = (
        escape
        .query("antibody in @antibodies_to_plot")
        .assign(highlight=lambda x: x.groupby("site")["highlight"].transform("any"))
    )
    fig, axes = dmslogo.facet_plot(
        df,
        x_col="sequential_site",
        show_col="highlight",
        gridrow_col="antibody",
        draw_line_kwargs={
            "height_col": "positive_site_escape",
            "height_col2": None if positive_escape_only else "negative_site_escape",
            "xtick_col": "site",
            "ylabel": "escape",
            "widthscale": 0.3,
        },
        draw_logo_kwargs={
            "letter_col": "mutant",
            "letter_height_col": "escape",
            "xtick_col": "site",
            "color_col": "letter_color",
            "widthscale": 0.7,
        },
        share_xlabel=True,
        share_ylabel=True,
        height_per_ax=1.9,
        wspace=0.6,
        share_ylim_across_rows=False,
    )

    # adjust x-ticks on line plots
    assert axes.shape[1] == 2
    for i in range(axes.shape[0]):
        axes[i, 0].xaxis.set_ticks(lineplot_xticks)
        if i == axes.shape[0] - 1:
            axes[i, 0].xaxis.set_ticklabels(lineplot_xticks)
        

    for ext in file_extensions:
        fname = os.path.join(logoplot_subdir, f"{antibody_group}{ext}")
        print(f"Saving to {fname}")
        fig.savefig(fname)

    display(fig)
    plt.close(fig)

Making plot for antibody_group='region_3_antibodies' with antibodies_to_plot=['17C7', 'CR4098', 'RVA122', 'RVC58']

Saving to results/escape_logos/region_3_antibodies.svg

Saving to results/escape_logos/region_3_antibodies.pdf

Making plot for antibody_group='region_1_antibodies' with antibodies_to_plot=['RVC20', 'CR57']

Saving to results/escape_logos/region_1_antibodies.svg

Saving to results/escape_logos/region_1_antibodies.pdf

Making plot for antibody_group='17C7' with antibodies_to_plot=['17C7']

Saving to results/escape_logos/17C7.svg
Saving to results/escape_logos/17C7.pdf

Making plot for antibody_group='CR4098' with antibodies_to_plot=['CR4098']

Saving to results/escape_logos/CR4098.svg
Saving to results/escape_logos/CR4098.pdf

	n_sites_to_highlight
antibody
17C7	7
CR4098	8
CR57	7
CTB012	24
RVA122	8
RVC20	7
RVC58	7
RVC68	6

Make logoplots showing key sites of escape for each antibody¶