In [1]:
######## snakemake preamble start (automatically inserted, do not edit) ########
import sys; sys.path.extend(['/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/dms-vep-pipeline-3/..', '/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/dms-vep-pipeline-3', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/bin', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11/lib-dynload', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/home/tyu2/.cache/snakemake/snakemake/source-cache/runtime-cache/tmptmbv20cp/file/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/analysis', '/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/analysis']); import pickle; snakemake = pickle.loads(b"\x80\x04\x95\x17\x0c\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8c.analysis/results/h3n2_ha_60y_phenotypes_df.csv\x94\x8c\x1bdata/site_numbering_map.csv\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\x14phenotypes_and_freqs\x94K\x00N\x86\x94\x8c\x12site_numbering_map\x94K\x01N\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh\x15\x8c\tfunctools\x94\x8c\x07partial\x94\x93\x94h\x06\x8c\x19Namedlist._used_attribute\x94\x93\x94\x85\x94R\x94(h\x1b)}\x94\x8c\x05_name\x94h\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bh\x0fh\nh\x11h\x0bub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94(\x8c&results/entrenchment/entrenchment.html\x94\x8c$results/notebooks/entrenchment.ipynb\x94e}\x94(h\r}\x94(\x8c\nchart_html\x94K\x00N\x86\x94\x8c\x02nb\x94K\x01N\x86\x94uh\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bh-h)h/h*ub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bub\x8c\x07threads\x94K\x01\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x01K\x01\x8c\x04/tmp\x94e}\x94(h\r}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bh`K\x01hbK\x01hdh]ub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c$results/notebooks/entrenchment.ipynb\x94a}\x94(h\r}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bhvhsub\x8c\x06config\x94}\x94(\x8c\x08homepage\x94\x8c\x0fhomepage/public\x94\x8c\x18build_vitepress_homepage\x94\x88\x8c\rpipeline_path\x94\x8c\x12dms-vep-pipeline-3\x94\x8c\x04docs\x94\x8c\x04docs\x94\x8c\x0fgithub_repo_url\x94\x8c7https://github.com/dms-vep/Flu_H3_Massachusetts2022_DMS\x94\x8c\x0fgithub_blob_url\x94\x8cAhttps://github.com/dms-vep/Flu_H3_Massachusetts2022_DMS/blob/main\x94\x8c\x0bdescription\x94\x8cHDeep mutational scanning of A/Massachusetts/18/2022 (H3N2) hemagglutinin\x94\x8c\x04year\x94M\xe9\x07\x8c\x07authors\x94\x8c>[Yu et al](https://www.nature.com/articles/s41559-025-02895-1)\x94\x8c\x12site_numbering_map\x94\x8c\x1bdata/site_numbering_map.csv\x94\x8c\x14mutation_annotations\x94\x8c5results/mutation_annotations/mutation_annotations.csv\x94\x8c\x1emutation_design_classification\x94}\x94(\x8c\x03csv\x94\x8c'data/mutation_design_classification.csv\x94\x8c\x08site_col\x94\x8c\x0fsequential_site\x94u\x8c\x16neut_standard_barcodes\x94\x8c)data/neutralization_standard_barcodes.csv\x94\x8c\x11prebuilt_variants\x94N\x8c\x10prebuilt_geneseq\x94N\x8c\x0bpacbio_runs\x94\x8c\x14data/PacBio_runs.csv\x94\x8c\x0fpacbio_amplicon\x94\x8c\x17data/PacBio_amplicon.gb\x94\x8c\x15pacbio_amplicon_specs\x94\x8c$data/PacBio_feature_parse_specs.yaml\x94\x8c\x0cvariant_tags\x94}\x94(\x8c\x0cvariant_tag5\x94}\x94(\x8c\tvariant_1\x94\x8c\x01G\x94\x8c\tvariant_2\x94\x8c\x01C\x94\x8c\x08wildtype\x94\x8c\x01A\x94u\x8c\x0cvariant_tag3\x94}\x94(\x8c\tvariant_1\x94h\xac\x8c\tvariant_2\x94h\xae\x8c\x08wildtype\x94h\xb0uu\x8c\x12max_ccs_error_rate\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x10consensus_params\x94}\x94(\x8c\rmax_sub_diffs\x94N\x8c\x0fmax_indel_diffs\x94N\x8c\x12max_minor_sub_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x14max_minor_indel_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x0bmin_support\x94K\x03u\x8c\x13gene_sequence_codon\x94\x8c!results/gene_sequence/codon.fasta\x94\x8c\x15gene_sequence_protein\x94\x8c#results/gene_sequence/protein.fasta\x94\x8c\x0ecodon_variants\x94\x8c#results/variants/codon_variants.csv\x94\x8c\x0cbarcode_runs\x94\x8c\x15data/barcode_runs.csv\x94\x8c\x1euse_precomputed_barcode_counts\x94\x89\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\x08upstream\x94\x8c#AACTCCACTAGGAACATTTCTCTCTCGAATCTAGA\x94\x8c\ndownstream\x94\x8c\x00\x94\x8c\x04minq\x94K\x14\x8c\x11upstream_mismatch\x94K\x02u\x8c\x13func_effects_config\x94\x8c\x1cdata/func_effects_config.yml\x94\x8c\x16antibody_escape_config\x94\x8c\x1fdata/antibody_escape_config.yml\x94\x8c\x10summaries_config\x94\x8c\x19data/summaries_config.yml\x94u\x8c\x04rule\x94\x8c\x15entrenchment_analysis\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cX/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/analysis\x94ub."); from snakemake.logging import logger; logger.printshellcmds = False; import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS');
######## snakemake preamble end #########

Entrenchment analysis¶

In [2]:
import os

import pandas as pd 
import altair as alt 
import numpy as np
import scipy
import theme

alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')

alt.data_transformers.disable_max_rows()
Out[2]:
DataTransformerRegistry.enable('default')
In [3]:
#phenotypes_df = pd.read_csv('results/h3n2_ha_60y_phenotypes_df.csv')
phenotypes_df = pd.read_csv(snakemake.input.phenotypes_and_freqs)

# site_map = pd.read_csv('../data/site_numbering_map.csv')
site_map = pd.read_csv(snakemake.input.site_numbering_map)

phenotypes_df = pd.merge(
    phenotypes_df,
    site_map,
    left_on=['site', 'sequential_site', 'wildtype', 'region'], 
    right_on=['reference_site', 'sequential_site', 'sequential_wt', 'region'], 
).drop(
    columns=['sequential_site', 'reference_site', 'sequential_wt']
).assign(
    appeared=phenotypes_df['most_recent_fix_date'].notna(),
    stability_measured=phenotypes_df['pH stability'].notna(),
    in_rbs=lambda x: x['rbs_region'].apply(
        lambda r: "Inside receptor binding pocket" if r != "outside RBS" else "Outside receptor binding pocket"
    )
)
In [4]:
def plot_phenotype_vs_date(data, phenotype, phenotype_title, colors, show_x=True, show_rbs_label=True):
    rbs_colors = {
        "Inside receptor binding pocket": colors[0],
        "Outside receptor binding pocket": colors[1],
    }

    y_min = data[phenotype].min()
    y_max = data[phenotype].max()

    # x_min = pd.to_datetime(data['most_recent_fix_date']).min()
    # x_max = pd.to_datetime(data['most_recent_fix_date']).max()

    # Ensure date is in datetime format
    data['most_recent_fix_date'] = pd.to_datetime(data['most_recent_fix_date'])

    # sort data to control which points overlay others
    data = data.sort_values('site', ascending=False)

    x_axis = alt.Axis(
        title=None if not show_x else ["Most recent date when", "amino acid was fixed"],
        labels=show_x,
        ticks=show_x,
        tickCount=5,
        domain=show_x,
        grid=True,
        gridColor='white',
        gridWidth=1.5
    )

    # Create base chart without data specification
    base = alt.Chart().encode(
        x=alt.X(
            "most_recent_fix_date:T",
            axis=x_axis
        ),
        y=alt.Y(
            phenotype,
            title=phenotype_title,
            scale=alt.Scale(
                domain=[y_min, y_max]
            ),
            axis=alt.Axis(
                grid=True,
                gridColor='white',
                gridWidth=1.5
            )
        ),
        color=alt.Color(
            "in_rbs",
            scale=alt.Scale(domain=list(rbs_colors.keys()), range=list(rbs_colors.values())),
            title=None,
            legend=None
        ),
        tooltip=[
            'site', 'wildtype', 'mutant', 'region', 'max_frequency', phenotype,
            'pH stability', 'rbs_region', 'most_recent_fix_date'
        ]
    )

    # Create horizontal line specification
    hline = alt.Chart().mark_rule(
        color='black',
        size=1.25,
        opacity=1.0,
        strokeDash=[6,6]
    ).encode(y=alt.Y(datum=0))

    # Layer everything together first
    complete_layer = alt.layer(
        base.mark_circle(size=60, opacity=1, stroke='black', strokeWidth=0.5),  # scatter plot
        hline,
        #base.transform_regression(
        #    'most_recent_fix_date',
        #    phenotype,
        #    groupby=['in_rbs'],
        #).mark_line(size=3),
    ).properties(
        width=300,
        height=150
    )

    header_kwargs = {
        'labelFontSize': 16,
        'labelFontWeight': 'bold',
    }

    if not show_rbs_label:
        header_kwargs['labelExpr'] = "''"

    # Apply faceting with data specification
    complete_chart = complete_layer.facet(
        data=data.query('mutant != wildtype and most_recent_fix_date.notna()'),
        facet=alt.Facet(
            'in_rbs',
            title=None,
            header=alt.Header(**header_kwargs)
        ),
        columns=2
    ).resolve_scale(
        y='shared',
        x='shared'
    )

    return complete_chart
In [5]:
entry_chart = plot_phenotype_vs_date(
    phenotypes_df, 
    'MDCKSIAT1 cell entry', 
    ['Effect on cell entry in', 'MA22 background'],
    ['#E41A1C', '#FFC1C3'],
    show_x=False
) 

stability_chart = plot_phenotype_vs_date(
    phenotypes_df, 
    'pH stability', 
    ['Effect on stability in', 'MA22 background'],
    ['#377EB8', '#C6DBEF'],
    show_rbs_label=False
) 

entrenchment_chart = (entry_chart & stability_chart).resolve_scale(
    color='independent',
    x='shared'
).configure_view(
    fill='#F1F1F1' # panel background
)

print(f"Saving {snakemake.output.chart_html=}")
entrenchment_chart.save(snakemake.output.chart_html)

entrenchment_chart
Saving snakemake.output.chart_html='results/entrenchment/entrenchment.html'
Out[5]: