# Parameters - will be overridden by papermill
strain = "RSV-A"
variations_with_effects_file = "results/sequence_variation/RSV-A_sequence_variations_with_effects.csv"
output_html = "results/sequence_variation/RSV-A_sequence_variation_effects.html"
analysis_description = "Differences from DMS Long Strain"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"

# Parameters
strain = "RSV-A"
variations_with_effects_file = (
    "results/sequence_variation/RSV-A_dms_ref_sequence_variations_with_effects.csv"
)
output_html = "results/sequence_variation/RSV-A_dms_ref_sequence_variation_effects.html"
analysis_description = "Differences from Strain Used in Deep Mutational Scanning"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"

import pandas as pd
import altair as alt
from Bio import SeqIO

_ = alt.data_transformers.disable_max_rows()

print(f"Analyzing strain: {strain}")
print(f"Analysis type: {analysis_description}")
print(f"Variations with effects: {variations_with_effects_file}")
print(f"Differences identified relative to: {mutation_identified_relative_to}")
print(f"Effects calculated relative to: {effects_calculated_relative_to}")
print(f"Output HTML: {output_html}")

Analyzing strain: RSV-A
Analysis type: Differences from Strain Used in Deep Mutational Scanning
Variations with effects: results/sequence_variation/RSV-A_dms_ref_sequence_variations_with_effects.csv
Differences identified relative to: RSV_Long_F
Effects calculated relative to: RSV_Long_F
Output HTML: results/sequence_variation/RSV-A_dms_ref_sequence_variation_effects.html

# Load sequence variations with effects
with_effects = pd.read_csv(variations_with_effects_file)
print(f"Loaded {len(with_effects)} {strain} sequence variations")
print(f"At {with_effects['site'].nunique()} sites")

# Separate those with and without DMS data
with_dms = with_effects.dropna(subset=['cell entry'])
without_dms = with_effects[with_effects['cell entry'].isna()]

print(f"\nVariations with DMS data: {len(with_dms)}")
print(f"Variations without DMS data: {len(without_dms)}")
print(f"Coverage: {len(with_dms) / len(with_effects) * 100:.1f}% of variations have DMS data")

display(with_effects.head())

# Also load all possible DMS mutations for comparison in plots
# Extract cell_entry_file path from the pipeline
import os
cell_entry_file = "results/summaries/cell_entry.csv"
cell_entry = pd.read_csv(cell_entry_file)
print(f"\nLoaded {len(cell_entry)} total possible mutations from DMS data for comparison")

Loaded 234 RSV-A sequence variations
At 147 sites

Variations with DMS data: 148
Variations without DMS data: 86
Coverage: 63.2% of variations have DMS data

Loaded 9800 total possible mutations from DMS data for comparison

print(f"Cell Entry Effect Statistics for {strain} Sequence Variations")
print(f"Analysis: {analysis_description}")
print("=" * 60)

# Filter to only variations with DMS data for statistics
with_dms = with_effects.dropna(subset=['cell entry'])

print(f"Statistics based on {len(with_dms)} variations with DMS data:")
print(f"Mean effect: {with_dms['cell entry'].mean():.3f}")
print(f"Median effect: {with_dms['cell entry'].median():.3f}")
print(f"Std deviation: {with_dms['cell entry'].std():.3f}")
print(f"\nEffect range: {with_dms['cell entry'].min():.3f} to {with_dms['cell entry'].max():.3f}")

# Count by effect category
print("\nEffect Categories:")
print(f"  Beneficial (> 0.5): {(with_dms['cell entry'] > 0.5).sum()} ({(with_dms['cell entry'] > 0.5).sum() / len(with_dms) * 100:.1f}%)")
print(f"  Neutral (-0.5 to 0.5): {((with_dms['cell entry'] >= -0.5) & (with_dms['cell entry'] <= 0.5)).sum()} ({((with_dms['cell entry'] >= -0.5) & (with_dms['cell entry'] <= 0.5)).sum() / len(with_dms) * 100:.1f}%)")
print(f"  Deleterious (< -0.5): {(with_dms['cell entry'] < -0.5).sum()} ({(with_dms['cell entry'] < -0.5).sum() / len(with_dms) * 100:.1f}%)")

# Most tolerated variations
print("\nMost Tolerated Sequence Variations (highest cell entry):")
display(with_dms.nlargest(10, 'cell entry')[['site', 'mutation_type', 'cell entry', 'mutation_count', 'region']])

# Most deleterious variations
print("\nMost Deleterious Sequence Variations (lowest cell entry):")
display(with_dms.nsmallest(10, 'cell entry')[['site', 'mutation_type', 'cell entry', 'mutation_count', 'region']])

Cell Entry Effect Statistics for RSV-A Sequence Variations
Analysis: Differences from Strain Used in Deep Mutational Scanning
============================================================
Statistics based on 148 variations with DMS data:
Mean effect: -0.204
Median effect: -0.104
Std deviation: 0.652

Effect range: -3.351 to 1.399

Effect Categories:
  Beneficial (> 0.5): 13 (8.8%)
  Neutral (-0.5 to 0.5): 95 (64.2%)
  Deleterious (< -0.5): 40 (27.0%)

Most Tolerated Sequence Variations (highest cell entry):

Most Deleterious Sequence Variations (lowest cell entry):

# Plot 1: Distribution histogram with all possible mutations and observed variations on separate y-axes
# Filter to only variations with DMS data for plotting
with_dms = with_effects.dropna(subset=['cell entry'])

# Define shared bin parameters with explicit step size to ensure both histograms have same bin widths
# Using a fixed step size guarantees identical bin boundaries for both datasets
bin_params = alt.Bin(step=0.2)

# Background histogram: All possible DMS mutations (blue) - right y-axis
hist_all_mutations = alt.Chart(cell_entry).mark_bar(color='blue', opacity=0.6).encode(
    alt.X('cell entry:Q', bin=bin_params, title='Cell Entry Effect', axis=alt.Axis(values=[-6, -5, -4, -3, -2, -1, 0, 1, 2, 3])),
    alt.Y('count()', title='All possible mutations', axis=alt.Axis(titleColor='blue')),
    tooltip=['count()']
)

# Foreground histogram: Observed sequence variations (green) - left y-axis
hist_observed = alt.Chart(with_dms).mark_bar(color='green', opacity=0.7).encode(
    alt.X('cell entry:Q', bin=bin_params, title='Cell Entry Effect', axis=alt.Axis(values=[-6, -5, -4, -3, -2, -1, 0, 1, 2, 3])),
    alt.Y('count()', title='Mutations in natural sequences', axis=alt.Axis(titleColor='green')),
    tooltip=['count()']
)

# Add zero reference line
rule_zero = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='red', strokeDash=[5, 5], strokeWidth=0.5).encode(x='x:Q')

# Layer histograms with independent y-axes and resolve x scale to shared
chart1 = alt.layer(
    hist_all_mutations,
    hist_observed,
    rule_zero
).resolve_scale(
    y='independent',
    x='shared'
).properties(
    width=600,
    height=400,
    title={
        "text": f'{strain} {analysis_description}',
        "subtitle": f'Distribution of Cell Entry Effects | Blue: All Possible Mutations (n={len(cell_entry)}) | Green: Mutations in Natural Sequences (n={len(with_dms)})'
    }
)

chart1

# Plot 2: Scatter plot by position
# Use only variations with DMS data

# Calculate total number of sequences for percentage calculation
# Hard-code values based on strain (from FASTA file sequence counts)
total_sequences = 5045 if strain == "RSV-A" else 5156  # RSV-A: 5045, RSV-B: 5156
with_dms['mutation_percent'] = (with_dms['mutation_count'] / total_sequences) * 100

chart2 = alt.Chart(with_dms).mark_circle(opacity=0.7).encode(
    x=alt.X('site:Q', title='Site', axis=alt.Axis(grid=True, gridWidth=0.25, gridColor='gray', values=[1, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500])),
    y=alt.Y('cell entry:Q', title='Cell Entry Effect', axis=alt.Axis(grid=True, gridWidth=0.25, gridColor='gray')),
    size=alt.Size(
        'mutation_percent:Q',
        title=['Percent of', 'natural sequences', 'with mutation'],
        scale=alt.Scale(range=[20, 400]),
        legend=alt.Legend(
            orient='right',
            titleLimit=150,
            titleOrient='top',
            offset=10
        )
    ),
    tooltip=['site', 'mutation_type', 'wildtype', 'mutant', 'mutation_count', 'mutation_percent', 'cell entry']
).properties(
    width=700,
    height=400,
    title={
        "text": f'{strain}',
        "subtitle": f'{analysis_description} - Cell Entry Effects by Position'
    }
)

# Add zero line
zero_line = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='red', strokeDash=[5, 5], strokeWidth=0.5).encode(y='y:Q')

chart2 = chart2 + zero_line

chart2

/tmp/ipykernel_25925/2123015846.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_dms['mutation_percent'] = (with_dms['mutation_count'] / total_sequences) * 100

# Combine all charts vertically
combined_chart = alt.vconcat(
    chart1,
    chart2,
    spacing=40
).configure_view(
    strokeWidth=0
).configure_axis(
    labelFontSize=22,
    titleFontSize=28,
    domainWidth=0.5,
    tickWidth=0.5,
    gridWidth=0.5,
    tickSize=4,
    tickColor='black'
).configure_title(
    fontSize=30,
    anchor='middle',
    subtitleFontSize=20
).configure_legend(
    labelFontSize=18,
    titleFontSize=20,
    gradientStrokeWidth=0.5
)

# Save to HTML
combined_chart.save(output_html)
print(f"Saved interactive plots to: {output_html}")

combined_chart

Saved interactive plots to: results/sequence_variation/RSV-A_dms_ref_sequence_variation_effects.html

	site	wildtype	mutant	mutation_type	mutation_count	cell entry	sequential_site	region
0	2	E	D	E→D	219	NaN	NaN	NaN
1	3	L	S	L→S	21	NaN	NaN	NaN
2	3	L	F	L→F	4	NaN	NaN	NaN
3	4	P	L	P→L	34	NaN	NaN	NaN
4	4	P	S	P→S	5	NaN	NaN	NaN

	site	mutation_type	cell entry	mutation_count	region
197	516	V→I	1.3990	4	none
70	88	N→T	1.1530	7	region_0
201	519	G→V	0.9563	8	none
72	94	Q→E	0.9339	19	region_0
149	228	N→S	0.7754	5	none
150	228	N→T	0.7464	5	none
107	120	N→Y	0.6935	4	none
155	276	N→S	0.6924	3917	region_II
109	121	N→S	0.5973	7	none
74	100	T→S	0.5721	7	none

	site	mutation_type	cell entry	mutation_count	region
73	99	S→N	-3.351	5	none
142	171	L→P	-2.826	6	region_V
77	101	T→P	-2.730	5036	none
181	462	Q→R	-1.461	3	region_IV
176	385	D→N	-1.214	3	region_I
80	102	A→V	-1.188	9	none
79	101	T→S	-1.182	3	none
194	510	D→N	-1.124	3	none
148	213	R→S	-1.091	5027	region_0
203	524	N→S	-1.062	8	none

Plot Cell Entry Effects of RSV F mutations observed in natural sequences relative to the DMS strain "RSV Long"¶

Analysis Overview¶

Configuration¶

Load data¶

Summary statistics¶

Create interactive plots with Altair¶

Save combined chart to HTML¶