#this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
nipah_alignment = None
entropy_output = None

# Parameters
nipah_config = "nipah_config.yaml"
nipah_alignment = "data/custom_analyses_data/alignments/Nipah_RBP_AA_align.fasta"
entropy_output = "results/entropy/entropy.csv"

import math
import os

import altair as alt

import numpy as np

import pandas as pd

import scipy.stats
from scipy import stats

import subprocess
import tempfile
import yaml
from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MuscleCommandline
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq
from Bio.Align import PairwiseAligner

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

Setup in correct directory

if nipah_alignment is None:
    nipah_config = 'nipah_config.yaml'
    nipah_alignment = 'data/custom_analyses_data/alignments/Nipah_RBP_AA_align.fasta'
    entropy_output = 'results/entropy/entropy.csv'

with open(nipah_config) as f:
    config = yaml.safe_load(f)

def shannon_entropy(column):
    """Compute the Shannon entropy of a column in the alignment."""
    counts = {}
    for aa in column:
        if aa in counts:
            counts[aa] += 1
        else:
            counts[aa] = 1

    entropy = 0.0
    for key in counts:
        freq = counts[key] / len(column)
        entropy += freq * math.log2(freq)
    return -entropy

def fetch_and_align(accession_numbers, email, output_folder="."):
    """
    Fetch sequences from GenBank based on accession numbers, align them,
    and return the alignment as a pandas DataFrame.

    Parameters:
    - accession_numbers: List of accession numbers.
    - email: Email address to be used with NCBI's Entrez.
    - output_folder: The directory where output files will be saved.

    Returns:
    - DataFrame representation of the alignment.
    """
    # Ensure the output directory exists, if not, create it.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Fetch sequences from GenBank
    Entrez.email = email
    sequences = []
    for acc in accession_numbers:
        handle = Entrez.efetch(db="protein", id=acc, rettype="fasta", retmode="text")
        seq_record = SeqIO.read(handle, "fasta")
        sequences.append(seq_record)
        handle.close()

    # Define file paths
    temp_sequences_path = os.path.join(output_folder, "henipavirus_RBP_sequences.fasta")
    aligned_path = os.path.join(output_folder, "henipavirus_RBP_sequences_aligned.fasta")

    # Write sequences to a temporary fasta file
    SeqIO.write(sequences, temp_sequences_path, "fasta")

    # Align using MUSCLE
    muscle_exe = "/fh/fast/bloom_j/software/miniconda3/envs/BloomLab/bin/muscle"
    muscle_result = subprocess.check_output([muscle_exe, "-align", temp_sequences_path, "-output", aligned_path])

    # Read the aligned sequences
    alignment = AlignIO.read(aligned_path, "fasta")

    # Convert alignment to DataFrame
    alignment_dict = {record.id: list(record.seq) for record in alignment}
    df_alignment = pd.DataFrame(alignment_dict)
    df_alignment = df_alignment.rename(columns={'YP_009094086.1':'cedar','AFH96011.1':'ghana','NP_112027.1':'nipah','NP_047112.2':'hendra','UCY33670.1':'hendra_G2','QDJ04463.1':'nipah_cambodia','QKV44014.1':'nipah_india','YP_009094095.1':'Mojiang','UUV47206.1':'Langya','AJP33320.1':'cedar_2'})
    
    # Calculate and add Shannon entropy for each site to the dataframe
    df_alignment['henipavirus_entropy'] = [shannon_entropy(df_alignment.iloc[i]) for i in range(df_alignment.shape[0])]
    
    return df_alignment

# Pull these genbank sequences
cedar = 'YP_009094086.1'
cedar2 = 'AJP33320.1'
ghana = 'AFH96011.1'
nipah = 'NP_112027.1',
nipah_cambodia = 'QDJ04463.1'
nipah_india = 'QKV44014.1'
hendra = 'NP_047112.2'
hendra_G2 = 'UCY33670.1'

seqs = [cedar, cedar2, ghana, nipah, nipah_cambodia, nipah_india, hendra, hendra_G2]
output_folder = "results/alignments/"
df = fetch_and_align(seqs, "blarsen@fredhutch.org", output_folder)
display(df.head(3))

muscle 5.1.linux64 []  791Gb RAM, 36 cores
Built Feb 24 2022 03:16:15
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 8 seqs, avg length 611, max 632

00:00 17Mb   CPU has 36 cores, defaulting to 20 threads

WARNING: Max OMP threads 1

00:02 18Mb    100.0% Calc posteriors
00:02 20Mb    100.0% Consistency (1/2)
00:02 20Mb    100.0% Consistency (2/2)
00:02 20Mb    100.0% UPGMA5
00:02 21Mb    100.0% Refining

# Create a boolean mask for the 'nipah' column
mask = df['nipah'] != '-'
# Use cumsum to count the occurrences and assign it to a new column 'site'
df['site'] = mask.cumsum()
# Reset the count to 0 for rows where 'nipah' is '-'
df.loc[~mask, 'site'] = 'NaN'

#Save entropy file for other notebooks to use
df.to_csv(entropy_output)
display(df.head(5))

relevant_columns = df.drop(columns=['henipavirus_entropy', 'site'])
df['conserved'] = relevant_columns.apply(lambda row: len(set(row)) == 1, axis=1)
conserved_sites = df[df['conserved']]['site'].sort_values().tolist()
print(f" These sites are completely conserved among representative Henipaviruses: {conserved_sites}")
print(f" The number of sites conserved across representative Henipaviruses are: {len(conserved_sites)}")

 These sites are completely conserved among representative Henipaviruses: [1, 17, 23, 38, 63, 97, 101, 104, 105, 107, 114, 119, 124, 146, 148, 151, 153, 158, 160, 162, 165, 181, 189, 203, 216, 220, 222, 229, 231, 233, 235, 240, 253, 254, 263, 282, 295, 324, 345, 355, 365, 382, 393, 395, 398, 439, 440, 454, 455, 457, 459, 460, 462, 479, 485, 486, 487, 488, 493, 494, 499, 500, 503, 506, 508, 509, 510, 524, 526, 528, 530, 532, 533, 534, 535, 540, 546, 561, 565, 566, 573, 574, 575, 579, 588, 589, 590, 598, 601]
 The number of sites conserved across representative Henipaviruses are: 89

# read in nipah alignment
alignment_path = nipah_alignment
alignment = AlignIO.read(alignment_path, "fasta")

# Convert alignment to DataFrame
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)

def shannon_entropy_and_mutant_aa(column, wildtype_aa):
    """
    Compute the Shannon entropy of a column in the alignment and return the top amino acid excluding the wildtype.
    
    Parameters:
    - column: A column from a sequence alignment, representing one site across multiple sequences.
    - wildtype_aa: The wildtype (original) amino acid at this position in a reference sequence.
    
    Returns:
    - The Shannon entropy of the column (a measure of diversity).
    - The amino acid variant that appears most frequently, excluding the wildtype.
    """
    # Initialize a dictionary to count occurrences of each amino acid
    counts = {}
    # Iterate through each amino acid in the column
    for aa in column:
        # Ignore gap ('-') and unknown ('X') characters
        if aa not in ["-", "X"]:
            # If the amino acid is already in the dictionary, increment its count
            if aa in counts:
                counts[aa] += 1
            # Otherwise, add it to the dictionary with a count of 1
            else:
                counts[aa] = 1
    
    # If counts is empty after filtering, return 0.0 entropy and None for the mutant amino acid
    if not counts:
        return 0.0, None
      
    # Calculate Shannon entropy
    entropy = 0.0
    for key in counts:
        freq = counts[key] / sum(counts.values())  # Calculate frequency of each amino acid
        entropy += freq * math.log2(freq)  # Add the frequency times the log base 2 of the frequency to the entropy

    # Remove the wildtype amino acid from counts if it's present
    counts.pop(wildtype_aa, None)
    # Sort the amino acids by frequency to find the mutant
    sorted_aas = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    mutant_aa = sorted_aas[0][0] if sorted_aas and sorted_aas[0][1] >= 2 else None
    
    # return entropy
    return -entropy, mutant_aa

# Path to the alignment file
alignment_path = nipah_alignment

# Read the alignment file using BioPython's AlignIO
alignment = AlignIO.read(alignment_path, "fasta")

# Convert the alignment to pandas
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)

# Extract the wildtype sequence from the DataFrame
wildtype_series = df_alignment['NC_002728.1_Nipah_virus_complete_genome']

# Compute entropy and mutant amino acid for each site in the alignment
values = [shannon_entropy_and_mutant_aa(df_alignment.iloc[i], wildtype_series[i]) for i in range(df_alignment.shape[0])]
# Unpack the computed values into two lists: entropies and mutants
entropies, mutants = zip(*values)

# Create a final DataFrame to hold the computed values along with site numbers
df_final = pd.DataFrame({
    'site': range(1, len(mutants) + 1),
    'entropy': entropies,
    'wildtype': wildtype_series,
    'mutant': mutants
})

# Filter to get rid of extra site at end
df_final = df_final[df_final['site'] < 603]
display(df_final)

sites_with_mutants = df_final.loc[df_final['mutant'].notnull(), 'site'].tolist()
polymorphisms = list(sites_with_mutants)
data_series = pd.Series(polymorphisms)

# filter out sites that are outside mutagenized region
filtered_series = data_series[data_series > 71]
polymorphisms = list(filtered_series)
polymorphism_length = len(polymorphisms)
print(f'There are {polymorphism_length} polymorphic sites in NiV RBP sequences (in the mutagenized ectodomain): {polymorphisms}')

There are 32 polymorphic sites in NiV RBP sequences (in the mutagenized ectodomain): [82, 89, 135, 172, 228, 236, 274, 288, 299, 325, 328, 329, 335, 339, 344, 376, 381, 384, 385, 386, 404, 421, 423, 424, 426, 427, 470, 478, 481, 498, 502, 545]

def find_other_henipavirus_mutants(df,virus):
    df_comparison = df.rename(columns={virus:'mutant'})
    df_comparison = df_comparison[['mutant','nipah','site']]

    #filter rows to differences
    no_dash_df = df_comparison[~df_comparison['mutant'].str.contains('-') & ~df_comparison['nipah'].str.contains('-')]
    filtered_df = no_dash_df[no_dash_df['mutant'] != no_dash_df['nipah']]    

    sites = list(filtered_df['site'].unique())
    data_series = pd.Series(sites)
    filtered_series = data_series[data_series > 71]
    series = list(filtered_series)
    print(f'{virus} is different from nipah at these sites:\n {series}\n')

find_other_henipavirus_mutants(df,'hendra')
find_other_henipavirus_mutants(df,'cedar')

hendra is different from nipah at these sites:
 [76, 82, 85, 86, 89, 90, 96, 98, 136, 144, 174, 175, 176, 180, 187, 194, 195, 196, 201, 209, 210, 211, 212, 213, 215, 224, 226, 228, 236, 241, 244, 245, 261, 265, 277, 279, 280, 284, 285, 287, 288, 289, 293, 299, 309, 311, 312, 315, 316, 317, 322, 326, 327, 329, 333, 334, 335, 337, 338, 339, 340, 342, 344, 371, 376, 385, 386, 388, 392, 401, 402, 403, 404, 420, 422, 423, 424, 425, 426, 427, 432, 434, 437, 446, 458, 466, 470, 473, 476, 478, 483, 498, 502, 507, 517, 520, 525, 527, 538, 548, 549, 550, 553, 554, 564, 569, 571, 586, 599, 602]

cedar is different from nipah at these sites:
 [72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 89, 92, 93, 94, 95, 96, 98, 99, 100, 102, 106, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 120, 121, 122, 123, 125, 126, 127, 129, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 147, 149, 150, 152, 155, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 182, 184, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 204, 206, 207, 210, 211, 212, 213, 214, 215, 217, 218, 219, 223, 224, 225, 226, 228, 230, 232, 234, 236, 238, 241, 242, 243, 244, 246, 247, 248, 249, 250, 251, 252, 255, 256, 261, 262, 266, 267, 268, 269, 270, 271, 272, 273, 275, 276, 277, 278, 280, 281, 283, 284, 286, 287, 289, 290, 291, 292, 294, 296, 297, 299, 300, 301, 302, 303, 304, 305, 308, 310, 312, 313, 314, 315, 316, 318, 319, 320, 321, 322, 325, 326, 327, 328, 329, 330, 332, 333, 340, 341, 342, 343, 346, 347, 348, 349, 350, 351, 353, 354, 356, 357, 358, 360, 361, 362, 363, 364, 366, 367, 368, 369, 370, 371, 372, 373, 374, 376, 377, 378, 379, 380, 383, 384, 385, 386, 387, 389, 391, 392, 394, 396, 397, 399, 400, 401, 402, 403, 406, 407, 408, 409, 410, 411, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 426, 427, 428, 429, 430, 432, 433, 434, 435, 438, 442, 443, 444, 445, 446, 447, 451, 452, 453, 456, 458, 463, 464, 466, 468, 470, 471, 472, 473, 475, 476, 477, 480, 489, 491, 492, 496, 497, 498, 504, 505, 507, 511, 512, 513, 514, 516, 517, 518, 519, 520, 521, 522, 523, 525, 529, 531, 536, 537, 541, 542, 543, 544, 548, 549, 550, 551, 552, 553, 554, 555, 556, 558, 559, 560, 562, 564, 568, 569, 570, 571, 572, 577, 578, 580, 581, 582, 583, 584, 585, 586, 587, 591, 592, 593, 594, 595, 599, 600]

henipavirus_conservation.ipynb¶

For running interactively:¶

Pull represantative henipavirus RBP amino acid sequences from genbank, align, calculate entropy, and convert to a dataframe¶

Make site numbering relative to Nipah reference sequence¶

Calculate Which Sites are 100% conserved across represantative henipavirus sequences¶

Calculate entropy from Nipah sequence alignment of RBP¶

Find polymorphic Nipah sites¶

	hendra	nipah	cedar	nipah_india	nipah_cambodia	cedar_2	ghana	hendra_G2	henipavirus_entropy	site
0	M	M	M	M	M	M	M	M	-0.00	1
1	M	P	L	P	P	L	P	-	1.75	2
2	A	A	S	T	A	S	Q	A	1.75	3
3	D	E	Q	E	E	Q	K	E	1.75	4
4	S	N	L	S	S	L	T	S	1.75	5

	site	entropy	wildtype	mutant
0	1	-0.000000	M	None
1	2	-0.000000	P	None
2	3	0.820741	A	T
3	4	-0.000000	E	None
4	5	0.726963	N	S
...	...	...	...	...
597	598	0.101067	P	None
598	599	-0.000000	E	None
599	600	-0.000000	Q	None
600	601	-0.000000	C	None
601	602	-0.000000	T	None