# Parameters - will be overridden by papermill
strain = "RSV-A"
variations_with_effects_file = "results/sequence_variation/RSV-A_sequence_variations_with_effects.csv"
output_html = "results/sequence_variation/RSV-A_sequence_variation_effects.html"
analysis_description = "Differences from DMS Long Strain"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"

# Parameters
strain = "RSV-B"
variations_with_effects_file = (
    "results/sequence_variation/RSV-B_dms_ref_sequence_variations_with_effects.csv"
)
output_html = "results/sequence_variation/RSV-B_dms_ref_sequence_variation_effects.html"
analysis_description = "Differences from Strain Used in Deep Mutational Scanning"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"

import pandas as pd
import altair as alt
from Bio import SeqIO

_ = alt.data_transformers.disable_max_rows()

print(f"Analyzing strain: {strain}")
print(f"Analysis type: {analysis_description}")
print(f"Variations with effects: {variations_with_effects_file}")
print(f"Differences identified relative to: {mutation_identified_relative_to}")
print(f"Effects calculated relative to: {effects_calculated_relative_to}")
print(f"Output HTML: {output_html}")

Analyzing strain: RSV-B
Analysis type: Differences from Strain Used in Deep Mutational Scanning
Variations with effects: results/sequence_variation/RSV-B_dms_ref_sequence_variations_with_effects.csv
Differences identified relative to: RSV_Long_F
Effects calculated relative to: RSV_Long_F
Output HTML: results/sequence_variation/RSV-B_dms_ref_sequence_variation_effects.html

# Load sequence variations with effects
with_effects = pd.read_csv(variations_with_effects_file)
print(f"Loaded {len(with_effects)} {strain} sequence variations")
print(f"At {with_effects['site'].nunique()} sites")

# Separate those with and without DMS data
with_dms = with_effects.dropna(subset=['cell entry'])
without_dms = with_effects[with_effects['cell entry'].isna()]

print(f"\nVariations with DMS data: {len(with_dms)}")
print(f"Variations without DMS data: {len(without_dms)}")
print(f"Coverage: {len(with_dms) / len(with_effects) * 100:.1f}% of variations have DMS data")

display(with_effects.head())

# Also load all possible DMS mutations for comparison in plots
# Extract cell_entry_file path from the pipeline
import os
cell_entry_file = "results/summaries/cell_entry.csv"
cell_entry = pd.read_csv(cell_entry_file)
print(f"\nLoaded {len(cell_entry)} total possible mutations from DMS data for comparison")

Loaded 220 RSV-B sequence variations
At 149 sites

Variations with DMS data: 154
Variations without DMS data: 66
Coverage: 70.0% of variations have DMS data

Loaded 9800 total possible mutations from DMS data for comparison

print(f"Cell Entry Effect Statistics for {strain} Sequence Variations")
print(f"Analysis: {analysis_description}")
print("=" * 60)

# Filter to only variations with DMS data for statistics
with_dms = with_effects.dropna(subset=['cell entry'])

print(f"Statistics based on {len(with_dms)} variations with DMS data:")
print(f"Mean effect: {with_dms['cell entry'].mean():.3f}")
print(f"Median effect: {with_dms['cell entry'].median():.3f}")
print(f"Std deviation: {with_dms['cell entry'].std():.3f}")
print(f"\nEffect range: {with_dms['cell entry'].min():.3f} to {with_dms['cell entry'].max():.3f}")

# Count by effect category
print("\nEffect Categories:")
print(f"  Beneficial (> 0.5): {(with_dms['cell entry'] > 0.5).sum()} ({(with_dms['cell entry'] > 0.5).sum() / len(with_dms) * 100:.1f}%)")
print(f"  Neutral (-0.5 to 0.5): {((with_dms['cell entry'] >= -0.5) & (with_dms['cell entry'] <= 0.5)).sum()} ({((with_dms['cell entry'] >= -0.5) & (with_dms['cell entry'] <= 0.5)).sum() / len(with_dms) * 100:.1f}%)")
print(f"  Deleterious (< -0.5): {(with_dms['cell entry'] < -0.5).sum()} ({(with_dms['cell entry'] < -0.5).sum() / len(with_dms) * 100:.1f}%)")

# Most tolerated variations
print("\nMost Tolerated Sequence Variations (highest cell entry):")
display(with_dms.nlargest(10, 'cell entry')[['site', 'mutation_type', 'cell entry', 'mutation_count', 'region']])

# Most deleterious variations
print("\nMost Deleterious Sequence Variations (lowest cell entry):")
display(with_dms.nsmallest(10, 'cell entry')[['site', 'mutation_type', 'cell entry', 'mutation_count', 'region']])

Cell Entry Effect Statistics for RSV-B Sequence Variations
Analysis: Differences from Strain Used in Deep Mutational Scanning
============================================================
Statistics based on 154 variations with DMS data:
Mean effect: -0.288
Median effect: -0.168
Std deviation: 0.774

Effect range: -3.351 to 1.399

Effect Categories:
  Beneficial (> 0.5): 13 (8.4%)
  Neutral (-0.5 to 0.5): 95 (61.7%)
  Deleterious (< -0.5): 46 (29.9%)

Most Tolerated Sequence Variations (highest cell entry):

Most Deleterious Sequence Variations (lowest cell entry):

# Plot 1: Distribution histogram with all possible mutations and observed variations on separate y-axes
# Filter to only variations with DMS data for plotting
with_dms = with_effects.dropna(subset=['cell entry'])

# Define shared bin parameters with explicit step size to ensure both histograms have same bin widths
# Using a fixed step size guarantees identical bin boundaries for both datasets
bin_params = alt.Bin(step=0.2)

# Background histogram: All possible DMS mutations (blue) - right y-axis
hist_all_mutations = alt.Chart(cell_entry).mark_bar(color='blue', opacity=0.6).encode(
    alt.X('cell entry:Q', bin=bin_params, title='Cell Entry Effect', axis=alt.Axis(values=[-6, -5, -4, -3, -2, -1, 0, 1, 2, 3])),
    alt.Y('count()', title='All possible mutations', axis=alt.Axis(titleColor='blue')),
    tooltip=['count()']
)

# Foreground histogram: Observed sequence variations (green) - left y-axis
hist_observed = alt.Chart(with_dms).mark_bar(color='green', opacity=0.7).encode(
    alt.X('cell entry:Q', bin=bin_params, title='Cell Entry Effect', axis=alt.Axis(values=[-6, -5, -4, -3, -2, -1, 0, 1, 2, 3])),
    alt.Y('count()', title='Mutations in natural sequences', axis=alt.Axis(titleColor='green')),
    tooltip=['count()']
)

# Add zero reference line
rule_zero = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='red', strokeDash=[5, 5], strokeWidth=0.5).encode(x='x:Q')

# Layer histograms with independent y-axes and resolve x scale to shared
chart1 = alt.layer(
    hist_all_mutations,
    hist_observed,
    rule_zero
).resolve_scale(
    y='independent',
    x='shared'
).properties(
    width=600,
    height=400,
    title={
        "text": f'{strain} {analysis_description}',
        "subtitle": f'Distribution of Cell Entry Effects | Blue: All Possible Mutations (n={len(cell_entry)}) | Green: Mutations in Natural Sequences (n={len(with_dms)})'
    }
)

chart1

# Plot 2: Scatter plot by position
# Use only variations with DMS data

# Calculate total number of sequences for percentage calculation
# Hard-code values based on strain (from FASTA file sequence counts)
total_sequences = 5045 if strain == "RSV-A" else 5156  # RSV-A: 5045, RSV-B: 5156
with_dms['mutation_percent'] = (with_dms['mutation_count'] / total_sequences) * 100

chart2 = alt.Chart(with_dms).mark_circle(opacity=0.7).encode(
    x=alt.X('site:Q', title='Site', axis=alt.Axis(grid=True, gridWidth=0.25, gridColor='gray', values=[1, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500])),
    y=alt.Y('cell entry:Q', title='Cell Entry Effect', axis=alt.Axis(grid=True, gridWidth=0.25, gridColor='gray')),
    size=alt.Size(
        'mutation_percent:Q',
        title=['Percent of', 'natural sequences', 'with mutation'],
        scale=alt.Scale(range=[20, 400]),
        legend=alt.Legend(
            orient='right',
            titleLimit=150,
            titleOrient='top',
            offset=10
        )
    ),
    tooltip=['site', 'mutation_type', 'wildtype', 'mutant', 'mutation_count', 'mutation_percent', 'cell entry']
).properties(
    width=700,
    height=400,
    title={
        "text": f'{strain}',
        "subtitle": f'{analysis_description} - Cell Entry Effects by Position'
    }
)

# Add zero line
zero_line = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='red', strokeDash=[5, 5], strokeWidth=0.5).encode(y='y:Q')

chart2 = chart2 + zero_line

chart2

/tmp/ipykernel_25924/2123015846.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_dms['mutation_percent'] = (with_dms['mutation_count'] / total_sequences) * 100

# Combine all charts vertically
combined_chart = alt.vconcat(
    chart1,
    chart2,
    spacing=40
).configure_view(
    strokeWidth=0
).configure_axis(
    labelFontSize=22,
    titleFontSize=28,
    domainWidth=0.5,
    tickWidth=0.5,
    gridWidth=0.5,
    tickSize=4,
    tickColor='black'
).configure_title(
    fontSize=30,
    anchor='middle',
    subtitleFontSize=20
).configure_legend(
    labelFontSize=18,
    titleFontSize=20,
    gradientStrokeWidth=0.5
)

# Save to HTML
combined_chart.save(output_html)
print(f"Saved interactive plots to: {output_html}")

combined_chart

Saved interactive plots to: results/sequence_variation/RSV-B_dms_ref_sequence_variation_effects.html

	site	wildtype	mutant	mutation_type	mutation_count	cell entry	sequential_site	region
0	2	E	G	E→G	7	NaN	NaN	NaN
1	3	L	S	L→S	4	NaN	NaN	NaN
2	3	L	V	L→V	3	NaN	NaN	NaN
3	4	P	L	P→L	5113	NaN	NaN	NaN
4	4	P	M	P→M	18	NaN	NaN	NaN

	site	mutation_type	cell entry	mutation_count	region
191	516	V→I	1.3990	3	none
51	68	K→Q	0.9148	12	region_0
48	67	N→T	0.9107	5135	region_0
200	529	T→V	0.8206	170	none
170	402	V→I	0.7811	5150	none
138	244	T→A	0.7755	3	none
135	228	N→S	0.7754	5153	none
144	276	N→S	0.6924	5035	region_II
172	419	K→E	0.6714	24	none
153	312	P→H	0.6659	121	region_III

	site	mutation_type	cell entry	mutation_count	region
64	99	S→N	-3.351	5154	none
120	172	L→P	-3.252	12	region_V
173	439	C→S	-3.240	3	region_IV
149	294	E→K	-2.861	9	region_V
174	440	D→Q	-2.813	3	region_IV
66	101	T→P	-2.730	5151	none
176	442	V→L	-1.780	3	region_IV
63	97	M→T	-1.747	5	none
55	76	V→A	-1.526	5	region_0
175	442	V→M	-1.396	3	region_IV

Plot Cell Entry Effects of RSV F mutations observed in natural sequences relative to the DMS strain "RSV Long"¶

Analysis Overview¶

Configuration¶

Load data¶

Summary statistics¶

Create interactive plots with Altair¶

Save combined chart to HTML¶