# Parameters - will be overridden by papermill
strain = "RSV-A"
alignment_file = "data/RSV_F_seqs/RSV-A_with_ref.fasta"
output_file = "results/sequence_variation/RSV-A_sequence_variations_with_effects.csv"
cell_entry_file = "results/summaries/cell_entry.csv"
min_mutation_count = 4
reference_name = "RSV_Long_F"  # Name of reference sequence in alignment
analysis_description = "Differences from Strain Used in Deep Mutational Scanning"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"

# Parameters
strain = "RSV-A"
alignment_file = "data/RSV_F_seqs/RSV-A_with_ref.fasta"
output_file = (
    "results/sequence_variation/RSV-A_dms_ref_sequence_variations_with_effects.csv"
)
cell_entry_file = "results/summaries/cell_entry.csv"
reference_name = "RSV_Long_F"
analysis_description = "Differences from Strain Used in Deep Mutational Scanning"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"
min_mutation_count = 3

import pandas as pd
import numpy as np
from Bio import SeqIO
from collections import Counter
import os

print(f"Analyzing strain: {strain}")
print(f"Analysis type: {analysis_description}")
print(f"Input alignment: {alignment_file}")
print(f"Reference sequence: {reference_name}")
print(f"Differences identified relative to: {mutation_identified_relative_to}")
print(f"Effects calculated relative to: {effects_calculated_relative_to}")
print(f"Cell entry data: {cell_entry_file}")
print(f"Output file: {output_file}")
print(f"Minimum mutation count threshold: {min_mutation_count}")

Analyzing strain: RSV-A
Analysis type: Differences from Strain Used in Deep Mutational Scanning
Input alignment: data/RSV_F_seqs/RSV-A_with_ref.fasta
Reference sequence: RSV_Long_F
Differences identified relative to: RSV_Long_F
Effects calculated relative to: RSV_Long_F
Cell entry data: results/summaries/cell_entry.csv
Output file: results/sequence_variation/RSV-A_dms_ref_sequence_variations_with_effects.csv
Minimum mutation count threshold: 3

def analyze_variable_sites_detailed(fasta_file, reference_name, min_mutation_count=5):
    """
    Analyze variable sites in a multiple sequence alignment.
    
    Parameters:
    -----------
    fasta_file : str
        Path to FASTA alignment file
    reference_name : str
        Name/ID of the reference sequence to use for comparison
    min_mutation_count : int
        Minimum number of sequences with a mutation for it to be reported
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with columns: site, wildtype, mutant, mutation_count, mutation_type
    """
    # Parse all sequences
    sequences = list(SeqIO.parse(fasta_file, 'fasta'))
    
    # Find the reference sequence by name
    reference = None
    for seq in sequences:
        if seq.id == reference_name:
            reference = seq
            break
    
    if reference is None:
        raise ValueError(f"Reference sequence '{reference_name}' not found in alignment")
    
    ref_name = reference.id
    ref_seq = str(reference.seq)
    
    print(f"Reference sequence: {ref_name}")
    print(f"Reference length: {len(ref_seq)} amino acids")
    print(f"Total sequences in alignment: {len(sequences)}")
    
    # Track mutations at each position
    variable_sites = []
    
    # Iterate through each position in the reference
    for pos in range(len(ref_seq)):
        site = pos + 1  # 1-based numbering
        ref_aa = ref_seq[pos]
        
        # Skip stop codons and gaps in reference
        if ref_aa == '*' or ref_aa == '-':
            continue
        
        # Collect all amino acids at this position across sequences
        mutations = []
        for seq in sequences:
            # Skip the reference itself
            if seq.id == reference_name:
                continue
                
            if pos < len(seq.seq):
                seq_aa = str(seq.seq[pos])
                # Only count if different from reference and not a stop or gap
                if seq_aa != ref_aa and seq_aa != '*' and seq_aa != '-':
                    mutations.append(seq_aa)
        
        # Count mutation frequencies
        if mutations:
            mutation_counts = Counter(mutations)
            
            # Report mutations meeting the threshold
            for mutant_aa, count in mutation_counts.items():
                if count >= min_mutation_count:
                    variable_sites.append({
                        'site': site,
                        'wildtype': ref_aa,
                        'mutant': mutant_aa,
                        'mutation_count': count,
                        'mutation_type': f"{ref_aa}→{mutant_aa}"
                    })
    
    # Create DataFrame and sort
    df = pd.DataFrame(variable_sites)
    if len(df) > 0:
        df = df.sort_values(['site', 'mutation_count'], ascending=[True, False])
    
    print(f"\nFound {len(df)} sequence variations at {df['site'].nunique()} sites")
    print(f"(appearing in at least {min_mutation_count} sequences)")
    
    return df

# Run analysis
variable_sites = analyze_variable_sites_detailed(
    alignment_file,
    reference_name,
    min_mutation_count=min_mutation_count
)

# Display first rows
print(f"\nFirst 20 sequence variations for {strain}:")
display(variable_sites.head(20))

Reference sequence: RSV_Long_F
Reference length: 574 amino acids
Total sequences in alignment: 5046

Found 234 sequence variations at 147 sites
(appearing in at least 3 sequences)

First 20 sequence variations for RSV-A:

# Load cell entry effects
cell_entry = pd.read_csv(cell_entry_file)
print(f"Loaded cell entry data: {len(cell_entry)} mutations")
print(f"Sites range: {cell_entry['site'].min()} - {cell_entry['site'].max()}")

# Build lookup dictionary: effects[site][amino_acid] = cell_entry_effect
effects = {}
site_info = {}  # Store sequential_site and region for each site

for _, row in cell_entry.iterrows():
    site = row['site']
    if site not in effects:
        effects[site] = {}
        site_info[site] = {
            'sequential_site': row['sequential_site'],
            'region': row['region']
        }
    effects[site][row['mutant']] = row['cell entry']

# Calculate effect for each sequence variation
variation_effects = []
variations_with_dms_data = 0

for _, var in variable_sites.iterrows():
    site = var['site']
    wt = var['wildtype']  # Wildtype from DMS strain
    mut = var['mutant']   # Amino acid in natural sequence
    
    # Check if we have DMS measurement for this mutant at this site
    if site in effects and mut in effects[site]:
        # Get the DMS cell entry effect for this mutation
        # (DMS effects are already normalized relative to wildtype)
        cell_entry_effect = effects[site][mut]
        variations_with_dms_data += 1
        
        variation_effects.append({
            'site': site,
            'wildtype': wt,
            'mutant': mut,
            'mutation_type': var['mutation_type'],
            'mutation_count': var['mutation_count'],
            'cell entry': cell_entry_effect,
            'sequential_site': site_info[site]['sequential_site'],
            'region': site_info[site]['region']
        })
    else:
        # Include variation even without DMS data, with NaN for effect columns
        variation_effects.append({
            'site': site,
            'wildtype': wt,
            'mutant': mut,
            'mutation_type': var['mutation_type'],
            'mutation_count': var['mutation_count'],
            'cell entry': np.nan,
            'sequential_site': site_info.get(site, {}).get('sequential_site', np.nan),
            'region': site_info.get(site, {}).get('region', pd.NA)
        })

with_effects = pd.DataFrame(variation_effects)

print(f"\nTotal {strain} sequence variations: {len(with_effects)}")
print(f"Variations with DMS data: {variations_with_dms_data}")
print(f"Variations without DMS data: {len(with_effects) - variations_with_dms_data}")
print(f"At {with_effects['site'].nunique()} sites")
print(f"\nCoverage: {variations_with_dms_data / len(with_effects) * 100:.1f}% of sequence variations have DMS data")

Loaded cell entry data: 9800 mutations
Sites range: 26 - 529

Total RSV-A sequence variations: 234
Variations with DMS data: 148
Variations without DMS data: 86
At 147 sites

Coverage: 63.2% of sequence variations have DMS data

# Display first rows
print(f"\nFirst 20 sequence variations for {strain}:")
display(with_effects.head(20))

# Filter to variations with DMS data for additional statistics
with_dms = with_effects.dropna(subset=['cell entry'])

if len(with_dms) > 0:
    print(f"\n\nCell Entry Effect Statistics (based on {len(with_dms)} variations with DMS data):")
    print("=" * 60)
    print(f"Mean effect: {with_dms['cell entry'].mean():.3f}")
    print(f"Median effect: {with_dms['cell entry'].median():.3f}")
    print(f"Std deviation: {with_dms['cell entry'].std():.3f}")
    print(f"Effect range: {with_dms['cell entry'].min():.3f} to {with_dms['cell entry'].max():.3f}")
    
    print(f"\nMost variable sites:")
    top_sites = with_effects.groupby('site').size().sort_values(ascending=False).head(10)
    for site, count in top_sites.items():
        wt = with_effects[with_effects['site'] == site]['wildtype'].iloc[0]
        print(f"  Site {site} ({wt}): {count} different mutations observed")
    
    print(f"\nMost common sequence variations with highest effects:")
    top_with_effects = with_dms.nlargest(10, 'cell entry')[['site', 'mutation_type', 'mutation_count', 'cell entry', 'region']]
    display(top_with_effects)
else:
    print(f"\nNo DMS data available for these sequence variations.")

First 20 sequence variations for RSV-A:


Cell Entry Effect Statistics (based on 148 variations with DMS data):
============================================================
Mean effect: -0.204
Median effect: -0.104
Std deviation: 0.652
Effect range: -3.351 to 1.399

Most variable sites:
  Site 19 (T): 4 different mutations observed
  Site 13 (T): 4 different mutations observed
  Site 6 (L): 4 different mutations observed
  Site 125 (T): 4 different mutations observed
  Site 120 (N): 4 different mutations observed
  Site 119 (L): 4 different mutations observed
  Site 124 (K): 4 different mutations observed
  Site 122 (T): 4 different mutations observed
  Site 115 (M): 4 different mutations observed
  Site 16 (A): 3 different mutations observed

Most common sequence variations with highest effects:

# Create output directory if needed
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Save to CSV
with_effects.to_csv(output_file, index=False)
print(f"\nSaved results to: {output_file}")
print(f"Total variations saved: {len(with_effects)}")
print(f"  - With DMS data: {len(with_effects.dropna(subset=['cell entry']))}")
print(f"  - Without DMS data: {len(with_effects[with_effects['cell entry'].isna()])}")

Saved results to: results/sequence_variation/RSV-A_dms_ref_sequence_variations_with_effects.csv
Total variations saved: 234
  - With DMS data: 148
  - Without DMS data: 86

	site	wildtype	mutant	mutation_count	mutation_type
0	2	E	D	219	E→D
1	3	L	S	21	L→S
2	3	L	F	4	L→F
3	4	P	L	34	P→L
4	4	P	S	5	P→S
5	5	I	T	18	I→T
6	5	I	V	3	I→V
7	6	L	I	77	L→I
8	6	L	F	55	L→F
10	6	L	H	17	L→H
9	6	L	S	4	L→S
11	7	K	R	20	K→R
12	7	K	N	13	K→N
13	8	A	T	4902	A→T
15	8	A	S	16	A→S
14	8	A	I	14	A→I
18	9	N	S	12	N→S
16	9	N	I	4	N→I
17	9	N	K	4	N→K
19	10	A	T	16	A→T

	site	mutation_type	mutation_count	cell entry	region
197	516	V→I	4	1.3990	none
70	88	N→T	7	1.1530	region_0
201	519	G→V	8	0.9563	none
72	94	Q→E	19	0.9339	region_0
149	228	N→S	5	0.7754	none
150	228	N→T	5	0.7464	none
107	120	N→Y	4	0.6935	none
155	276	N→S	3917	0.6924	region_II
109	121	N→S	7	0.5973	none
74	100	T→S	7	0.5721	none

Find Variable Sites from RSV F Protein Alignment¶

Configuration¶

Define function to analyze variable sites¶

Analyze alignment¶

Load cell entry data and calculate effects¶

Summary statistics¶

Save results¶

	site	wildtype	mutant	mutation_type	mutation_count	cell entry	sequential_site	region
0	2	E	D	E→D	219	NaN	NaN	<NA>
1	3	L	S	L→S	21	NaN	NaN	<NA>
2	3	L	F	L→F	4	NaN	NaN	<NA>
3	4	P	L	P→L	34	NaN	NaN	<NA>
4	4	P	S	P→S	5	NaN	NaN	<NA>
5	5	I	T	I→T	18	NaN	NaN	<NA>
6	5	I	V	I→V	3	NaN	NaN	<NA>
7	6	L	I	L→I	77	NaN	NaN	<NA>
8	6	L	F	L→F	55	NaN	NaN	<NA>
9	6	L	H	L→H	17	NaN	NaN	<NA>
10	6	L	S	L→S	4	NaN	NaN	<NA>
11	7	K	R	K→R	20	NaN	NaN	<NA>
12	7	K	N	K→N	13	NaN	NaN	<NA>
13	8	A	T	A→T	4902	NaN	NaN	<NA>
14	8	A	S	A→S	16	NaN	NaN	<NA>
15	8	A	I	A→I	14	NaN	NaN	<NA>
16	9	N	S	N→S	12	NaN	NaN	<NA>
17	9	N	I	N→I	4	NaN	NaN	<NA>
18	9	N	K	N→K	4	NaN	NaN	<NA>
19	10	A	T	A→T	16	NaN	NaN	<NA>