# Parameters - will be overridden by papermill
strain = "RSV-A"
alignment_file = "data/RSV_F_seqs/RSV-A_with_ref.fasta"
output_file = "results/sequence_variation/RSV-A_sequence_variations_with_effects.csv"
cell_entry_file = "results/summaries/cell_entry.csv"
min_mutation_count = 4
reference_name = "RSV_Long_F"  # Name of reference sequence in alignment
analysis_description = "Differences from Strain Used in Deep Mutational Scanning"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"

# Parameters
strain = "RSV-B"
alignment_file = "data/RSV_F_seqs/RSV-B_with_ref.fasta"
output_file = (
    "results/sequence_variation/RSV-B_dms_ref_sequence_variations_with_effects.csv"
)
cell_entry_file = "results/summaries/cell_entry.csv"
reference_name = "RSV_Long_F"
analysis_description = "Differences from Strain Used in Deep Mutational Scanning"
mutation_identified_relative_to = "RSV_Long_F"
effects_calculated_relative_to = "RSV_Long_F"
min_mutation_count = 3

import pandas as pd
import numpy as np
from Bio import SeqIO
from collections import Counter
import os

print(f"Analyzing strain: {strain}")
print(f"Analysis type: {analysis_description}")
print(f"Input alignment: {alignment_file}")
print(f"Reference sequence: {reference_name}")
print(f"Differences identified relative to: {mutation_identified_relative_to}")
print(f"Effects calculated relative to: {effects_calculated_relative_to}")
print(f"Cell entry data: {cell_entry_file}")
print(f"Output file: {output_file}")
print(f"Minimum mutation count threshold: {min_mutation_count}")

Analyzing strain: RSV-B
Analysis type: Differences from Strain Used in Deep Mutational Scanning
Input alignment: data/RSV_F_seqs/RSV-B_with_ref.fasta
Reference sequence: RSV_Long_F
Differences identified relative to: RSV_Long_F
Effects calculated relative to: RSV_Long_F
Cell entry data: results/summaries/cell_entry.csv
Output file: results/sequence_variation/RSV-B_dms_ref_sequence_variations_with_effects.csv
Minimum mutation count threshold: 3

def analyze_variable_sites_detailed(fasta_file, reference_name, min_mutation_count=5):
    """
    Analyze variable sites in a multiple sequence alignment.
    
    Parameters:
    -----------
    fasta_file : str
        Path to FASTA alignment file
    reference_name : str
        Name/ID of the reference sequence to use for comparison
    min_mutation_count : int
        Minimum number of sequences with a mutation for it to be reported
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with columns: site, wildtype, mutant, mutation_count, mutation_type
    """
    # Parse all sequences
    sequences = list(SeqIO.parse(fasta_file, 'fasta'))
    
    # Find the reference sequence by name
    reference = None
    for seq in sequences:
        if seq.id == reference_name:
            reference = seq
            break
    
    if reference is None:
        raise ValueError(f"Reference sequence '{reference_name}' not found in alignment")
    
    ref_name = reference.id
    ref_seq = str(reference.seq)
    
    print(f"Reference sequence: {ref_name}")
    print(f"Reference length: {len(ref_seq)} amino acids")
    print(f"Total sequences in alignment: {len(sequences)}")
    
    # Track mutations at each position
    variable_sites = []
    
    # Iterate through each position in the reference
    for pos in range(len(ref_seq)):
        site = pos + 1  # 1-based numbering
        ref_aa = ref_seq[pos]
        
        # Skip stop codons and gaps in reference
        if ref_aa == '*' or ref_aa == '-':
            continue
        
        # Collect all amino acids at this position across sequences
        mutations = []
        for seq in sequences:
            # Skip the reference itself
            if seq.id == reference_name:
                continue
                
            if pos < len(seq.seq):
                seq_aa = str(seq.seq[pos])
                # Only count if different from reference and not a stop or gap
                if seq_aa != ref_aa and seq_aa != '*' and seq_aa != '-':
                    mutations.append(seq_aa)
        
        # Count mutation frequencies
        if mutations:
            mutation_counts = Counter(mutations)
            
            # Report mutations meeting the threshold
            for mutant_aa, count in mutation_counts.items():
                if count >= min_mutation_count:
                    variable_sites.append({
                        'site': site,
                        'wildtype': ref_aa,
                        'mutant': mutant_aa,
                        'mutation_count': count,
                        'mutation_type': f"{ref_aa}→{mutant_aa}"
                    })
    
    # Create DataFrame and sort
    df = pd.DataFrame(variable_sites)
    if len(df) > 0:
        df = df.sort_values(['site', 'mutation_count'], ascending=[True, False])
    
    print(f"\nFound {len(df)} sequence variations at {df['site'].nunique()} sites")
    print(f"(appearing in at least {min_mutation_count} sequences)")
    
    return df

# Run analysis
variable_sites = analyze_variable_sites_detailed(
    alignment_file,
    reference_name,
    min_mutation_count=min_mutation_count
)

# Display first rows
print(f"\nFirst 20 sequence variations for {strain}:")
display(variable_sites.head(20))

Reference sequence: RSV_Long_F
Reference length: 574 amino acids
Total sequences in alignment: 5157

Found 220 sequence variations at 149 sites
(appearing in at least 3 sequences)

First 20 sequence variations for RSV-B:

# Load cell entry effects
cell_entry = pd.read_csv(cell_entry_file)
print(f"Loaded cell entry data: {len(cell_entry)} mutations")
print(f"Sites range: {cell_entry['site'].min()} - {cell_entry['site'].max()}")

# Build lookup dictionary: effects[site][amino_acid] = cell_entry_effect
effects = {}
site_info = {}  # Store sequential_site and region for each site

for _, row in cell_entry.iterrows():
    site = row['site']
    if site not in effects:
        effects[site] = {}
        site_info[site] = {
            'sequential_site': row['sequential_site'],
            'region': row['region']
        }
    effects[site][row['mutant']] = row['cell entry']

# Calculate effect for each sequence variation
variation_effects = []
variations_with_dms_data = 0

for _, var in variable_sites.iterrows():
    site = var['site']
    wt = var['wildtype']  # Wildtype from DMS strain
    mut = var['mutant']   # Amino acid in natural sequence
    
    # Check if we have DMS measurement for this mutant at this site
    if site in effects and mut in effects[site]:
        # Get the DMS cell entry effect for this mutation
        # (DMS effects are already normalized relative to wildtype)
        cell_entry_effect = effects[site][mut]
        variations_with_dms_data += 1
        
        variation_effects.append({
            'site': site,
            'wildtype': wt,
            'mutant': mut,
            'mutation_type': var['mutation_type'],
            'mutation_count': var['mutation_count'],
            'cell entry': cell_entry_effect,
            'sequential_site': site_info[site]['sequential_site'],
            'region': site_info[site]['region']
        })
    else:
        # Include variation even without DMS data, with NaN for effect columns
        variation_effects.append({
            'site': site,
            'wildtype': wt,
            'mutant': mut,
            'mutation_type': var['mutation_type'],
            'mutation_count': var['mutation_count'],
            'cell entry': np.nan,
            'sequential_site': site_info.get(site, {}).get('sequential_site', np.nan),
            'region': site_info.get(site, {}).get('region', pd.NA)
        })

with_effects = pd.DataFrame(variation_effects)

print(f"\nTotal {strain} sequence variations: {len(with_effects)}")
print(f"Variations with DMS data: {variations_with_dms_data}")
print(f"Variations without DMS data: {len(with_effects) - variations_with_dms_data}")
print(f"At {with_effects['site'].nunique()} sites")
print(f"\nCoverage: {variations_with_dms_data / len(with_effects) * 100:.1f}% of sequence variations have DMS data")

Loaded cell entry data: 9800 mutations
Sites range: 26 - 529

Total RSV-B sequence variations: 220
Variations with DMS data: 154
Variations without DMS data: 66
At 149 sites

Coverage: 70.0% of sequence variations have DMS data

# Display first rows
print(f"\nFirst 20 sequence variations for {strain}:")
display(with_effects.head(20))

# Filter to variations with DMS data for additional statistics
with_dms = with_effects.dropna(subset=['cell entry'])

if len(with_dms) > 0:
    print(f"\n\nCell Entry Effect Statistics (based on {len(with_dms)} variations with DMS data):")
    print("=" * 60)
    print(f"Mean effect: {with_dms['cell entry'].mean():.3f}")
    print(f"Median effect: {with_dms['cell entry'].median():.3f}")
    print(f"Std deviation: {with_dms['cell entry'].std():.3f}")
    print(f"Effect range: {with_dms['cell entry'].min():.3f} to {with_dms['cell entry'].max():.3f}")
    
    print(f"\nMost variable sites:")
    top_sites = with_effects.groupby('site').size().sort_values(ascending=False).head(10)
    for site, count in top_sites.items():
        wt = with_effects[with_effects['site'] == site]['wildtype'].iloc[0]
        print(f"  Site {site} ({wt}): {count} different mutations observed")
    
    print(f"\nMost common sequence variations with highest effects:")
    top_with_effects = with_dms.nlargest(10, 'cell entry')[['site', 'mutation_type', 'mutation_count', 'cell entry', 'region']]
    display(top_with_effects)
else:
    print(f"\nNo DMS data available for these sequence variations.")

First 20 sequence variations for RSV-B:


Cell Entry Effect Statistics (based on 154 variations with DMS data):
============================================================
Mean effect: -0.288
Median effect: -0.168
Std deviation: 0.774
Effect range: -3.351 to 1.399

Most variable sites:
  Site 129 (L): 4 different mutations observed
  Site 18 (V): 3 different mutations observed
  Site 22 (F): 3 different mutations observed
  Site 12 (T): 3 different mutations observed
  Site 6 (L): 3 different mutations observed
  Site 209 (K): 3 different mutations observed
  Site 119 (L): 3 different mutations observed
  Site 123 (K): 3 different mutations observed
  Site 121 (N): 3 different mutations observed
  Site 113 (R): 3 different mutations observed

Most common sequence variations with highest effects:

# Create output directory if needed
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Save to CSV
with_effects.to_csv(output_file, index=False)
print(f"\nSaved results to: {output_file}")
print(f"Total variations saved: {len(with_effects)}")
print(f"  - With DMS data: {len(with_effects.dropna(subset=['cell entry']))}")
print(f"  - Without DMS data: {len(with_effects[with_effects['cell entry'].isna()])}")

Saved results to: results/sequence_variation/RSV-B_dms_ref_sequence_variations_with_effects.csv
Total variations saved: 220
  - With DMS data: 154
  - Without DMS data: 66

	site	wildtype	mutant	mutation_count	mutation_type
0	2	E	G	7	E→G
2	3	L	S	4	L→S
1	3	L	V	3	L→V
3	4	P	L	5113	P→L
4	4	P	M	18	P→M
5	5	I	V	48	I→V
6	5	I	T	5	I→T
7	6	L	H	5124	L→H
9	6	L	Y	19	L→Y
8	6	L	R	13	L→R
10	7	K	R	5132	K→R
11	7	K	G	3	K→G
12	8	A	S	5147	A→S
13	8	A	P	7	A→P
14	9	N	S	5140	N→S
15	11	I	V	15	I→V
16	12	T	F	4942	T→F
17	12	T	L	114	T→L
18	12	T	I	99	T→I
19	13	T	L	5148	T→L

	site	mutation_type	mutation_count	cell entry	region
191	516	V→I	3	1.3990	none
51	68	K→Q	12	0.9148	region_0
48	67	N→T	5135	0.9107	region_0
200	529	T→V	170	0.8206	none
170	402	V→I	5150	0.7811	none
138	244	T→A	3	0.7755	none
135	228	N→S	5153	0.7754	none
144	276	N→S	5035	0.6924	region_II
172	419	K→E	24	0.6714	none
153	312	P→H	121	0.6659	region_III

Find Variable Sites from RSV F Protein Alignment¶

Configuration¶

Define function to analyze variable sites¶

Analyze alignment¶

Load cell entry data and calculate effects¶

Summary statistics¶

Save results¶

	site	wildtype	mutant	mutation_type	mutation_count	cell entry	sequential_site	region
0	2	E	G	E→G	7	NaN	NaN	<NA>
1	3	L	S	L→S	4	NaN	NaN	<NA>
2	3	L	V	L→V	3	NaN	NaN	<NA>
3	4	P	L	P→L	5113	NaN	NaN	<NA>
4	4	P	M	P→M	18	NaN	NaN	<NA>
5	5	I	V	I→V	48	NaN	NaN	<NA>
6	5	I	T	I→T	5	NaN	NaN	<NA>
7	6	L	H	L→H	5124	NaN	NaN	<NA>
8	6	L	Y	L→Y	19	NaN	NaN	<NA>
9	6	L	R	L→R	13	NaN	NaN	<NA>
10	7	K	R	K→R	5132	NaN	NaN	<NA>
11	7	K	G	K→G	3	NaN	NaN	<NA>
12	8	A	S	A→S	5147	NaN	NaN	<NA>
13	8	A	P	A→P	7	NaN	NaN	<NA>
14	9	N	S	N→S	5140	NaN	NaN	<NA>
15	11	I	V	I→V	15	NaN	NaN	<NA>
16	12	T	F	T→F	4942	NaN	NaN	<NA>
17	12	T	L	T→L	114	NaN	NaN	<NA>
18	12	T	I	T→I	99	NaN	NaN	<NA>
19	13	T	L	T→L	5148	NaN	NaN	<NA>