henipavirus_conservation.ipynb¶

This notebook analyzes features of henipavirus and nipah sequence conservation

  • Written by Brendan Larsen
In [1]:
#this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
nipah_alignment = None
entropy_output = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
nipah_alignment = "data/custom_analyses_data/alignments/Nipah_RBP_AA_align.fasta"
entropy_output = "results/entropy/entropy.csv"
In [3]:
import math
import os

import altair as alt

import numpy as np

import pandas as pd

import scipy.stats
from scipy import stats

import subprocess
import tempfile
import yaml
from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MuscleCommandline
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory

For running interactively:¶

In [5]:
if nipah_alignment is None:
    nipah_config = 'nipah_config.yaml'
    nipah_alignment = 'data/custom_analyses_data/alignments/Nipah_RBP_AA_align.fasta'
    entropy_output = 'results/entropy/entropy.csv'
In [6]:
with open(nipah_config) as f:
    config = yaml.safe_load(f)

Pull represantative henipavirus RBP amino acid sequences from genbank, align, calculate entropy, and convert to a dataframe¶

In [7]:
def shannon_entropy(column):
    """Compute the Shannon entropy of a column in the alignment."""
    counts = {}
    for aa in column:
        if aa in counts:
            counts[aa] += 1
        else:
            counts[aa] = 1

    entropy = 0.0
    for key in counts:
        freq = counts[key] / len(column)
        entropy += freq * math.log2(freq)
    return -entropy

def fetch_and_align(accession_numbers, email, output_folder="."):
    """
    Fetch sequences from GenBank based on accession numbers, align them,
    and return the alignment as a pandas DataFrame.

    Parameters:
    - accession_numbers: List of accession numbers.
    - email: Email address to be used with NCBI's Entrez.
    - output_folder: The directory where output files will be saved.

    Returns:
    - DataFrame representation of the alignment.
    """
    # Ensure the output directory exists, if not, create it.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Fetch sequences from GenBank
    Entrez.email = email
    sequences = []
    for acc in accession_numbers:
        handle = Entrez.efetch(db="protein", id=acc, rettype="fasta", retmode="text")
        seq_record = SeqIO.read(handle, "fasta")
        sequences.append(seq_record)
        handle.close()

    # Define file paths
    temp_sequences_path = os.path.join(output_folder, "henipavirus_RBP_sequences.fasta")
    aligned_path = os.path.join(output_folder, "henipavirus_RBP_sequences_aligned.fasta")

    # Write sequences to a temporary fasta file
    SeqIO.write(sequences, temp_sequences_path, "fasta")

    # Align using MUSCLE
    muscle_exe = "/fh/fast/bloom_j/software/miniconda3/envs/BloomLab/bin/muscle"
    muscle_result = subprocess.check_output([muscle_exe, "-align", temp_sequences_path, "-output", aligned_path])

    # Read the aligned sequences
    alignment = AlignIO.read(aligned_path, "fasta")

    # Convert alignment to DataFrame
    alignment_dict = {record.id: list(record.seq) for record in alignment}
    df_alignment = pd.DataFrame(alignment_dict)
    df_alignment = df_alignment.rename(columns={'YP_009094086.1':'cedar','AFH96011.1':'ghana','NP_112027.1':'nipah','NP_047112.2':'hendra','UCY33670.1':'hendra_G2','QDJ04463.1':'nipah_cambodia','QKV44014.1':'nipah_india','YP_009094095.1':'Mojiang','UUV47206.1':'Langya','AJP33320.1':'cedar_2'})
    
    # Calculate and add Shannon entropy for each site to the dataframe
    df_alignment['henipavirus_entropy'] = [shannon_entropy(df_alignment.iloc[i]) for i in range(df_alignment.shape[0])]
    
    return df_alignment

# Pull these genbank sequences
cedar = 'YP_009094086.1'
cedar2 = 'AJP33320.1'
ghana = 'AFH96011.1'
nipah = 'NP_112027.1',
nipah_cambodia = 'QDJ04463.1'
nipah_india = 'QKV44014.1'
hendra = 'NP_047112.2'
hendra_G2 = 'UCY33670.1'

seqs = [cedar, cedar2, ghana, nipah, nipah_cambodia, nipah_india, hendra, hendra_G2]
output_folder = "results/alignments/"
df = fetch_and_align(seqs, "blarsen@fredhutch.org", output_folder)
display(df.head(3))
muscle 5.1.linux64 []  791Gb RAM, 36 cores
Built Feb 24 2022 03:16:15
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 8 seqs, avg length 611, max 632

00:00 17Mb   CPU has 36 cores, defaulting to 20 threads

WARNING: Max OMP threads 1

00:02 18Mb    100.0% Calc posteriors
00:02 20Mb    100.0% Consistency (1/2)
00:02 20Mb    100.0% Consistency (2/2)
00:02 20Mb    100.0% UPGMA5
00:02 21Mb    100.0% Refining
hendra nipah cedar nipah_india nipah_cambodia cedar_2 ghana hendra_G2 henipavirus_entropy
0 M M M M M M M M -0.00
1 M P L P P L P - 1.75
2 A A S T A S Q A 1.75

Now we have an alignment of representative henipaviruses in a pandas dataframe format.

Make site numbering relative to Nipah reference sequence¶

In [8]:
# Create a boolean mask for the 'nipah' column
mask = df['nipah'] != '-'
# Use cumsum to count the occurrences and assign it to a new column 'site'
df['site'] = mask.cumsum()
# Reset the count to 0 for rows where 'nipah' is '-'
df.loc[~mask, 'site'] = 'NaN'

#Save entropy file for other notebooks to use
df.to_csv(entropy_output)
display(df.head(5))
hendra nipah cedar nipah_india nipah_cambodia cedar_2 ghana hendra_G2 henipavirus_entropy site
0 M M M M M M M M -0.00 1
1 M P L P P L P - 1.75 2
2 A A S T A S Q A 1.75 3
3 D E Q E E Q K E 1.75 4
4 S N L S S L T S 1.75 5

Calculate Which Sites are 100% conserved across represantative henipavirus sequences¶

In [9]:
relevant_columns = df.drop(columns=['henipavirus_entropy', 'site'])
df['conserved'] = relevant_columns.apply(lambda row: len(set(row)) == 1, axis=1)
conserved_sites = df[df['conserved']]['site'].sort_values().tolist()
print(f" These sites are completely conserved among representative Henipaviruses: {conserved_sites}")
print(f" The number of sites conserved across representative Henipaviruses are: {len(conserved_sites)}")
 These sites are completely conserved among representative Henipaviruses: [1, 17, 23, 38, 63, 97, 101, 104, 105, 107, 114, 119, 124, 146, 148, 151, 153, 158, 160, 162, 165, 181, 189, 203, 216, 220, 222, 229, 231, 233, 235, 240, 253, 254, 263, 282, 295, 324, 345, 355, 365, 382, 393, 395, 398, 439, 440, 454, 455, 457, 459, 460, 462, 479, 485, 486, 487, 488, 493, 494, 499, 500, 503, 506, 508, 509, 510, 524, 526, 528, 530, 532, 533, 534, 535, 540, 546, 561, 565, 566, 573, 574, 575, 579, 588, 589, 590, 598, 601]
 The number of sites conserved across representative Henipaviruses are: 89

Calculate entropy from Nipah sequence alignment of RBP¶

In [10]:
# read in nipah alignment
alignment_path = nipah_alignment
alignment = AlignIO.read(alignment_path, "fasta")

# Convert alignment to DataFrame
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)
In [11]:
def shannon_entropy_and_mutant_aa(column, wildtype_aa):
    """
    Compute the Shannon entropy of a column in the alignment and return the top amino acid excluding the wildtype.
    
    Parameters:
    - column: A column from a sequence alignment, representing one site across multiple sequences.
    - wildtype_aa: The wildtype (original) amino acid at this position in a reference sequence.
    
    Returns:
    - The Shannon entropy of the column (a measure of diversity).
    - The amino acid variant that appears most frequently, excluding the wildtype.
    """
    # Initialize a dictionary to count occurrences of each amino acid
    counts = {}
    # Iterate through each amino acid in the column
    for aa in column:
        # Ignore gap ('-') and unknown ('X') characters
        if aa not in ["-", "X"]:
            # If the amino acid is already in the dictionary, increment its count
            if aa in counts:
                counts[aa] += 1
            # Otherwise, add it to the dictionary with a count of 1
            else:
                counts[aa] = 1
    
    # If counts is empty after filtering, return 0.0 entropy and None for the mutant amino acid
    if not counts:
        return 0.0, None
      
    # Calculate Shannon entropy
    entropy = 0.0
    for key in counts:
        freq = counts[key] / sum(counts.values())  # Calculate frequency of each amino acid
        entropy += freq * math.log2(freq)  # Add the frequency times the log base 2 of the frequency to the entropy

    # Remove the wildtype amino acid from counts if it's present
    counts.pop(wildtype_aa, None)
    # Sort the amino acids by frequency to find the mutant
    sorted_aas = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    mutant_aa = sorted_aas[0][0] if sorted_aas and sorted_aas[0][1] >= 2 else None
    
    # return entropy
    return -entropy, mutant_aa

# Path to the alignment file
alignment_path = nipah_alignment

# Read the alignment file using BioPython's AlignIO
alignment = AlignIO.read(alignment_path, "fasta")

# Convert the alignment to pandas
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)

# Extract the wildtype sequence from the DataFrame
wildtype_series = df_alignment['NC_002728.1_Nipah_virus_complete_genome']

# Compute entropy and mutant amino acid for each site in the alignment
values = [shannon_entropy_and_mutant_aa(df_alignment.iloc[i], wildtype_series[i]) for i in range(df_alignment.shape[0])]
# Unpack the computed values into two lists: entropies and mutants
entropies, mutants = zip(*values)

# Create a final DataFrame to hold the computed values along with site numbers
df_final = pd.DataFrame({
    'site': range(1, len(mutants) + 1),
    'entropy': entropies,
    'wildtype': wildtype_series,
    'mutant': mutants
})

# Filter to get rid of extra site at end
df_final = df_final[df_final['site'] < 603]
display(df_final)
site entropy wildtype mutant
0 1 -0.000000 M None
1 2 -0.000000 P None
2 3 0.820741 A T
3 4 -0.000000 E None
4 5 0.726963 N S
... ... ... ... ...
597 598 0.101067 P None
598 599 -0.000000 E None
599 600 -0.000000 Q None
600 601 -0.000000 C None
601 602 -0.000000 T None

602 rows × 4 columns

Find polymorphic Nipah sites¶

In [12]:
sites_with_mutants = df_final.loc[df_final['mutant'].notnull(), 'site'].tolist()
polymorphisms = list(sites_with_mutants)
data_series = pd.Series(polymorphisms)

# filter out sites that are outside mutagenized region
filtered_series = data_series[data_series > 71]
polymorphisms = list(filtered_series)
polymorphism_length = len(polymorphisms)
print(f'There are {polymorphism_length} polymorphic sites in NiV RBP sequences (in the mutagenized ectodomain): {polymorphisms}')
There are 32 polymorphic sites in NiV RBP sequences (in the mutagenized ectodomain): [82, 89, 135, 172, 228, 236, 274, 288, 299, 325, 328, 329, 335, 339, 344, 376, 381, 384, 385, 386, 404, 421, 423, 424, 426, 427, 470, 478, 481, 498, 502, 545]
In [13]:
def find_other_henipavirus_mutants(df,virus):
    df_comparison = df.rename(columns={virus:'mutant'})
    df_comparison = df_comparison[['mutant','nipah','site']]

    #filter rows to differences
    no_dash_df = df_comparison[~df_comparison['mutant'].str.contains('-') & ~df_comparison['nipah'].str.contains('-')]
    filtered_df = no_dash_df[no_dash_df['mutant'] != no_dash_df['nipah']]    

    sites = list(filtered_df['site'].unique())
    data_series = pd.Series(sites)
    filtered_series = data_series[data_series > 71]
    series = list(filtered_series)
    print(f'{virus} is different from nipah at these sites:\n {series}\n')

find_other_henipavirus_mutants(df,'hendra')
find_other_henipavirus_mutants(df,'cedar')
hendra is different from nipah at these sites:
 [76, 82, 85, 86, 89, 90, 96, 98, 136, 144, 174, 175, 176, 180, 187, 194, 195, 196, 201, 209, 210, 211, 212, 213, 215, 224, 226, 228, 236, 241, 244, 245, 261, 265, 277, 279, 280, 284, 285, 287, 288, 289, 293, 299, 309, 311, 312, 315, 316, 317, 322, 326, 327, 329, 333, 334, 335, 337, 338, 339, 340, 342, 344, 371, 376, 385, 386, 388, 392, 401, 402, 403, 404, 420, 422, 423, 424, 425, 426, 427, 432, 434, 437, 446, 458, 466, 470, 473, 476, 478, 483, 498, 502, 507, 517, 520, 525, 527, 538, 548, 549, 550, 553, 554, 564, 569, 571, 586, 599, 602]

cedar is different from nipah at these sites:
 [72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 89, 92, 93, 94, 95, 96, 98, 99, 100, 102, 106, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 120, 121, 122, 123, 125, 126, 127, 129, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 147, 149, 150, 152, 155, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 182, 184, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 204, 206, 207, 210, 211, 212, 213, 214, 215, 217, 218, 219, 223, 224, 225, 226, 228, 230, 232, 234, 236, 238, 241, 242, 243, 244, 246, 247, 248, 249, 250, 251, 252, 255, 256, 261, 262, 266, 267, 268, 269, 270, 271, 272, 273, 275, 276, 277, 278, 280, 281, 283, 284, 286, 287, 289, 290, 291, 292, 294, 296, 297, 299, 300, 301, 302, 303, 304, 305, 308, 310, 312, 313, 314, 315, 316, 318, 319, 320, 321, 322, 325, 326, 327, 328, 329, 330, 332, 333, 340, 341, 342, 343, 346, 347, 348, 349, 350, 351, 353, 354, 356, 357, 358, 360, 361, 362, 363, 364, 366, 367, 368, 369, 370, 371, 372, 373, 374, 376, 377, 378, 379, 380, 383, 384, 385, 386, 387, 389, 391, 392, 394, 396, 397, 399, 400, 401, 402, 403, 406, 407, 408, 409, 410, 411, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 426, 427, 428, 429, 430, 432, 433, 434, 435, 438, 442, 443, 444, 445, 446, 447, 451, 452, 453, 456, 458, 463, 464, 466, 468, 470, 471, 472, 473, 475, 476, 477, 480, 489, 491, 492, 496, 497, 498, 504, 505, 507, 511, 512, 513, 514, 516, 517, 518, 519, 520, 521, 522, 523, 525, 529, 531, 536, 537, 541, 542, 543, 544, 548, 549, 550, 551, 552, 553, 554, 555, 556, 558, 559, 560, 562, 564, 568, 569, 570, 571, 572, 577, 578, 580, 581, 582, 583, 584, 585, 586, 587, 591, 592, 593, 594, 595, 599, 600]

In [ ]: