In [1]:
######## snakemake preamble start (automatically inserted, do not edit) ########
import sys; sys.path.extend(['/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/dms-vep-pipeline-3/..', '/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/dms-vep-pipeline-3', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/bin', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11/lib-dynload', '/home/tyu2/.conda/envs/dms-vep-pipeline-3/lib/python3.11/site-packages', '/home/tyu2/.cache/snakemake/snakemake/source-cache/runtime-cache/tmptmbv20cp/file/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/analysis', '/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/analysis']); import pickle; snakemake = pickle.loads(b"\x80\x04\x95\x17\x0c\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8c.analysis/results/h3n2_ha_60y_phenotypes_df.csv\x94\x8c\x1bdata/site_numbering_map.csv\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\x14phenotypes_and_freqs\x94K\x00N\x86\x94\x8c\x12site_numbering_map\x94K\x01N\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh\x15\x8c\tfunctools\x94\x8c\x07partial\x94\x93\x94h\x06\x8c\x19Namedlist._used_attribute\x94\x93\x94\x85\x94R\x94(h\x1b)}\x94\x8c\x05_name\x94h\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bh\x0fh\nh\x11h\x0bub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94(\x8c&results/entrenchment/entrenchment.html\x94\x8c$results/notebooks/entrenchment.ipynb\x94e}\x94(h\r}\x94(\x8c\nchart_html\x94K\x00N\x86\x94\x8c\x02nb\x94K\x01N\x86\x94uh\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bh-h)h/h*ub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94}\x94(h\r}\x94h\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bub\x8c\x07threads\x94K\x01\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x01K\x01\x8c\x04/tmp\x94e}\x94(h\r}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bh`K\x01hbK\x01hdh]ub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c$results/notebooks/entrenchment.ipynb\x94a}\x94(h\r}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh\x13]\x94(h\x15h\x16eh\x15h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x15sNt\x94bh\x16h\x19h\x1b\x85\x94R\x94(h\x1b)}\x94h\x1fh\x16sNt\x94bhvhsub\x8c\x06config\x94}\x94(\x8c\x08homepage\x94\x8c\x0fhomepage/public\x94\x8c\x18build_vitepress_homepage\x94\x88\x8c\rpipeline_path\x94\x8c\x12dms-vep-pipeline-3\x94\x8c\x04docs\x94\x8c\x04docs\x94\x8c\x0fgithub_repo_url\x94\x8c7https://github.com/dms-vep/Flu_H3_Massachusetts2022_DMS\x94\x8c\x0fgithub_blob_url\x94\x8cAhttps://github.com/dms-vep/Flu_H3_Massachusetts2022_DMS/blob/main\x94\x8c\x0bdescription\x94\x8cHDeep mutational scanning of A/Massachusetts/18/2022 (H3N2) hemagglutinin\x94\x8c\x04year\x94M\xe9\x07\x8c\x07authors\x94\x8c>[Yu et al](https://www.nature.com/articles/s41559-025-02895-1)\x94\x8c\x12site_numbering_map\x94\x8c\x1bdata/site_numbering_map.csv\x94\x8c\x14mutation_annotations\x94\x8c5results/mutation_annotations/mutation_annotations.csv\x94\x8c\x1emutation_design_classification\x94}\x94(\x8c\x03csv\x94\x8c'data/mutation_design_classification.csv\x94\x8c\x08site_col\x94\x8c\x0fsequential_site\x94u\x8c\x16neut_standard_barcodes\x94\x8c)data/neutralization_standard_barcodes.csv\x94\x8c\x11prebuilt_variants\x94N\x8c\x10prebuilt_geneseq\x94N\x8c\x0bpacbio_runs\x94\x8c\x14data/PacBio_runs.csv\x94\x8c\x0fpacbio_amplicon\x94\x8c\x17data/PacBio_amplicon.gb\x94\x8c\x15pacbio_amplicon_specs\x94\x8c$data/PacBio_feature_parse_specs.yaml\x94\x8c\x0cvariant_tags\x94}\x94(\x8c\x0cvariant_tag5\x94}\x94(\x8c\tvariant_1\x94\x8c\x01G\x94\x8c\tvariant_2\x94\x8c\x01C\x94\x8c\x08wildtype\x94\x8c\x01A\x94u\x8c\x0cvariant_tag3\x94}\x94(\x8c\tvariant_1\x94h\xac\x8c\tvariant_2\x94h\xae\x8c\x08wildtype\x94h\xb0uu\x8c\x12max_ccs_error_rate\x94G?\x1a6\xe2\xeb\x1cC-\x8c\x10consensus_params\x94}\x94(\x8c\rmax_sub_diffs\x94N\x8c\x0fmax_indel_diffs\x94N\x8c\x12max_minor_sub_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x14max_minor_indel_frac\x94G?\xc9\x99\x99\x99\x99\x99\x9a\x8c\x0bmin_support\x94K\x03u\x8c\x13gene_sequence_codon\x94\x8c!results/gene_sequence/codon.fasta\x94\x8c\x15gene_sequence_protein\x94\x8c#results/gene_sequence/protein.fasta\x94\x8c\x0ecodon_variants\x94\x8c#results/variants/codon_variants.csv\x94\x8c\x0cbarcode_runs\x94\x8c\x15data/barcode_runs.csv\x94\x8c\x1euse_precomputed_barcode_counts\x94\x89\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\x08upstream\x94\x8c#AACTCCACTAGGAACATTTCTCTCTCGAATCTAGA\x94\x8c\ndownstream\x94\x8c\x00\x94\x8c\x04minq\x94K\x14\x8c\x11upstream_mismatch\x94K\x02u\x8c\x13func_effects_config\x94\x8c\x1cdata/func_effects_config.yml\x94\x8c\x16antibody_escape_config\x94\x8c\x1fdata/antibody_escape_config.yml\x94\x8c\x10summaries_config\x94\x8c\x19data/summaries_config.yml\x94u\x8c\x04rule\x94\x8c\x15entrenchment_analysis\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cX/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS/analysis\x94ub."); from snakemake.logging import logger; logger.printshellcmds = False; import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/tyu2/2024/Flu_H3_Massachusetts2022_DMS');
######## snakemake preamble end #########
Entrenchment analysis¶
In [2]:
import os
import pandas as pd
import altair as alt
import numpy as np
import scipy
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
alt.data_transformers.disable_max_rows()
Out[2]:
DataTransformerRegistry.enable('default')
In [3]:
#phenotypes_df = pd.read_csv('results/h3n2_ha_60y_phenotypes_df.csv')
phenotypes_df = pd.read_csv(snakemake.input.phenotypes_and_freqs)
# site_map = pd.read_csv('../data/site_numbering_map.csv')
site_map = pd.read_csv(snakemake.input.site_numbering_map)
phenotypes_df = pd.merge(
phenotypes_df,
site_map,
left_on=['site', 'sequential_site', 'wildtype', 'region'],
right_on=['reference_site', 'sequential_site', 'sequential_wt', 'region'],
).drop(
columns=['sequential_site', 'reference_site', 'sequential_wt']
).assign(
appeared=phenotypes_df['most_recent_fix_date'].notna(),
stability_measured=phenotypes_df['pH stability'].notna(),
in_rbs=lambda x: x['rbs_region'].apply(
lambda r: "Inside receptor binding pocket" if r != "outside RBS" else "Outside receptor binding pocket"
)
)
In [4]:
def plot_phenotype_vs_date(data, phenotype, phenotype_title, colors, show_x=True, show_rbs_label=True):
rbs_colors = {
"Inside receptor binding pocket": colors[0],
"Outside receptor binding pocket": colors[1],
}
y_min = data[phenotype].min()
y_max = data[phenotype].max()
# x_min = pd.to_datetime(data['most_recent_fix_date']).min()
# x_max = pd.to_datetime(data['most_recent_fix_date']).max()
# Ensure date is in datetime format
data['most_recent_fix_date'] = pd.to_datetime(data['most_recent_fix_date'])
# sort data to control which points overlay others
data = data.sort_values('site', ascending=False)
x_axis = alt.Axis(
title=None if not show_x else ["Most recent date when", "amino acid was fixed"],
labels=show_x,
ticks=show_x,
tickCount=5,
domain=show_x,
grid=True,
gridColor='white',
gridWidth=1.5
)
# Create base chart without data specification
base = alt.Chart().encode(
x=alt.X(
"most_recent_fix_date:T",
axis=x_axis
),
y=alt.Y(
phenotype,
title=phenotype_title,
scale=alt.Scale(
domain=[y_min, y_max]
),
axis=alt.Axis(
grid=True,
gridColor='white',
gridWidth=1.5
)
),
color=alt.Color(
"in_rbs",
scale=alt.Scale(domain=list(rbs_colors.keys()), range=list(rbs_colors.values())),
title=None,
legend=None
),
tooltip=[
'site', 'wildtype', 'mutant', 'region', 'max_frequency', phenotype,
'pH stability', 'rbs_region', 'most_recent_fix_date'
]
)
# Create horizontal line specification
hline = alt.Chart().mark_rule(
color='black',
size=1.25,
opacity=1.0,
strokeDash=[6,6]
).encode(y=alt.Y(datum=0))
# Layer everything together first
complete_layer = alt.layer(
base.mark_circle(size=60, opacity=1, stroke='black', strokeWidth=0.5), # scatter plot
hline,
#base.transform_regression(
# 'most_recent_fix_date',
# phenotype,
# groupby=['in_rbs'],
#).mark_line(size=3),
).properties(
width=300,
height=150
)
header_kwargs = {
'labelFontSize': 16,
'labelFontWeight': 'bold',
}
if not show_rbs_label:
header_kwargs['labelExpr'] = "''"
# Apply faceting with data specification
complete_chart = complete_layer.facet(
data=data.query('mutant != wildtype and most_recent_fix_date.notna()'),
facet=alt.Facet(
'in_rbs',
title=None,
header=alt.Header(**header_kwargs)
),
columns=2
).resolve_scale(
y='shared',
x='shared'
)
return complete_chart
In [5]:
entry_chart = plot_phenotype_vs_date(
phenotypes_df,
'MDCKSIAT1 cell entry',
['Effect on cell entry in', 'MA22 background'],
['#E41A1C', '#FFC1C3'],
show_x=False
)
stability_chart = plot_phenotype_vs_date(
phenotypes_df,
'pH stability',
['Effect on stability in', 'MA22 background'],
['#377EB8', '#C6DBEF'],
show_rbs_label=False
)
entrenchment_chart = (entry_chart & stability_chart).resolve_scale(
color='independent',
x='shared'
).configure_view(
fill='#F1F1F1' # panel background
)
print(f"Saving {snakemake.output.chart_html=}")
entrenchment_chart.save(snakemake.output.chart_html)
entrenchment_chart
Saving snakemake.output.chart_html='results/entrenchment/entrenchment.html'
Out[5]: