henipavirus_conservation.ipynb¶
This notebook analyzes features of henipavirus and nipah sequence conservation
- Written by Brendan Larsen
In [1]:
#this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
nipah_alignment = None
entropy_output = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
nipah_alignment = "data/custom_analyses_data/alignments/Nipah_RBP_AA_align.fasta"
entropy_output = "results/entropy/entropy.csv"
In [3]:
import math
import os
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
from scipy import stats
import subprocess
import tempfile
import yaml
from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MuscleCommandline
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
For running interactively:¶
In [5]:
if nipah_alignment is None:
nipah_config = 'nipah_config.yaml'
nipah_alignment = 'data/custom_analyses_data/alignments/Nipah_RBP_AA_align.fasta'
entropy_output = 'results/entropy/entropy.csv'
In [6]:
with open(nipah_config) as f:
config = yaml.safe_load(f)
Pull represantative henipavirus RBP amino acid sequences from genbank, align, calculate entropy, and convert to a dataframe¶
In [7]:
def shannon_entropy(column):
"""Compute the Shannon entropy of a column in the alignment."""
counts = {}
for aa in column:
if aa in counts:
counts[aa] += 1
else:
counts[aa] = 1
entropy = 0.0
for key in counts:
freq = counts[key] / len(column)
entropy += freq * math.log2(freq)
return -entropy
def fetch_and_align(accession_numbers, email, output_folder="."):
"""
Fetch sequences from GenBank based on accession numbers, align them,
and return the alignment as a pandas DataFrame.
Parameters:
- accession_numbers: List of accession numbers.
- email: Email address to be used with NCBI's Entrez.
- output_folder: The directory where output files will be saved.
Returns:
- DataFrame representation of the alignment.
"""
# Ensure the output directory exists, if not, create it.
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Fetch sequences from GenBank
Entrez.email = email
sequences = []
for acc in accession_numbers:
handle = Entrez.efetch(db="protein", id=acc, rettype="fasta", retmode="text")
seq_record = SeqIO.read(handle, "fasta")
sequences.append(seq_record)
handle.close()
# Define file paths
temp_sequences_path = os.path.join(output_folder, "henipavirus_RBP_sequences.fasta")
aligned_path = os.path.join(output_folder, "henipavirus_RBP_sequences_aligned.fasta")
# Write sequences to a temporary fasta file
SeqIO.write(sequences, temp_sequences_path, "fasta")
# Align using MUSCLE
muscle_exe = "/fh/fast/bloom_j/software/miniconda3/envs/BloomLab/bin/muscle"
muscle_result = subprocess.check_output([muscle_exe, "-align", temp_sequences_path, "-output", aligned_path])
# Read the aligned sequences
alignment = AlignIO.read(aligned_path, "fasta")
# Convert alignment to DataFrame
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)
df_alignment = df_alignment.rename(columns={'YP_009094086.1':'cedar','AFH96011.1':'ghana','NP_112027.1':'nipah','NP_047112.2':'hendra','UCY33670.1':'hendra_G2','QDJ04463.1':'nipah_cambodia','QKV44014.1':'nipah_india','YP_009094095.1':'Mojiang','UUV47206.1':'Langya','AJP33320.1':'cedar_2'})
# Calculate and add Shannon entropy for each site to the dataframe
df_alignment['henipavirus_entropy'] = [shannon_entropy(df_alignment.iloc[i]) for i in range(df_alignment.shape[0])]
return df_alignment
# Pull these genbank sequences
cedar = 'YP_009094086.1'
cedar2 = 'AJP33320.1'
ghana = 'AFH96011.1'
nipah = 'NP_112027.1',
nipah_cambodia = 'QDJ04463.1'
nipah_india = 'QKV44014.1'
hendra = 'NP_047112.2'
hendra_G2 = 'UCY33670.1'
seqs = [cedar, cedar2, ghana, nipah, nipah_cambodia, nipah_india, hendra, hendra_G2]
output_folder = "results/alignments/"
df = fetch_and_align(seqs, "blarsen@fredhutch.org", output_folder)
display(df.head(3))
muscle 5.1.linux64 [] 791Gb RAM, 36 cores Built Feb 24 2022 03:16:15 (C) Copyright 2004-2021 Robert C. Edgar. https://drive5.com Input: 8 seqs, avg length 611, max 632 00:00 17Mb CPU has 36 cores, defaulting to 20 threads WARNING: Max OMP threads 1 00:02 18Mb 100.0% Calc posteriors 00:02 20Mb 100.0% Consistency (1/2) 00:02 20Mb 100.0% Consistency (2/2) 00:02 20Mb 100.0% UPGMA5 00:02 21Mb 100.0% Refining
hendra | nipah | cedar | nipah_india | nipah_cambodia | cedar_2 | ghana | hendra_G2 | henipavirus_entropy | |
---|---|---|---|---|---|---|---|---|---|
0 | M | M | M | M | M | M | M | M | -0.00 |
1 | M | P | L | P | P | L | P | - | 1.75 |
2 | A | A | S | T | A | S | Q | A | 1.75 |
Now we have an alignment of representative henipaviruses in a pandas dataframe format.
Make site numbering relative to Nipah reference sequence¶
In [8]:
# Create a boolean mask for the 'nipah' column
mask = df['nipah'] != '-'
# Use cumsum to count the occurrences and assign it to a new column 'site'
df['site'] = mask.cumsum()
# Reset the count to 0 for rows where 'nipah' is '-'
df.loc[~mask, 'site'] = 'NaN'
#Save entropy file for other notebooks to use
df.to_csv(entropy_output)
display(df.head(5))
hendra | nipah | cedar | nipah_india | nipah_cambodia | cedar_2 | ghana | hendra_G2 | henipavirus_entropy | site | |
---|---|---|---|---|---|---|---|---|---|---|
0 | M | M | M | M | M | M | M | M | -0.00 | 1 |
1 | M | P | L | P | P | L | P | - | 1.75 | 2 |
2 | A | A | S | T | A | S | Q | A | 1.75 | 3 |
3 | D | E | Q | E | E | Q | K | E | 1.75 | 4 |
4 | S | N | L | S | S | L | T | S | 1.75 | 5 |
Calculate Which Sites are 100% conserved across represantative henipavirus sequences¶
In [9]:
relevant_columns = df.drop(columns=['henipavirus_entropy', 'site'])
df['conserved'] = relevant_columns.apply(lambda row: len(set(row)) == 1, axis=1)
conserved_sites = df[df['conserved']]['site'].sort_values().tolist()
print(f" These sites are completely conserved among representative Henipaviruses: {conserved_sites}")
print(f" The number of sites conserved across representative Henipaviruses are: {len(conserved_sites)}")
These sites are completely conserved among representative Henipaviruses: [1, 17, 23, 38, 63, 97, 101, 104, 105, 107, 114, 119, 124, 146, 148, 151, 153, 158, 160, 162, 165, 181, 189, 203, 216, 220, 222, 229, 231, 233, 235, 240, 253, 254, 263, 282, 295, 324, 345, 355, 365, 382, 393, 395, 398, 439, 440, 454, 455, 457, 459, 460, 462, 479, 485, 486, 487, 488, 493, 494, 499, 500, 503, 506, 508, 509, 510, 524, 526, 528, 530, 532, 533, 534, 535, 540, 546, 561, 565, 566, 573, 574, 575, 579, 588, 589, 590, 598, 601] The number of sites conserved across representative Henipaviruses are: 89
Calculate entropy from Nipah sequence alignment of RBP¶
In [10]:
# read in nipah alignment
alignment_path = nipah_alignment
alignment = AlignIO.read(alignment_path, "fasta")
# Convert alignment to DataFrame
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)
In [11]:
def shannon_entropy_and_mutant_aa(column, wildtype_aa):
"""
Compute the Shannon entropy of a column in the alignment and return the top amino acid excluding the wildtype.
Parameters:
- column: A column from a sequence alignment, representing one site across multiple sequences.
- wildtype_aa: The wildtype (original) amino acid at this position in a reference sequence.
Returns:
- The Shannon entropy of the column (a measure of diversity).
- The amino acid variant that appears most frequently, excluding the wildtype.
"""
# Initialize a dictionary to count occurrences of each amino acid
counts = {}
# Iterate through each amino acid in the column
for aa in column:
# Ignore gap ('-') and unknown ('X') characters
if aa not in ["-", "X"]:
# If the amino acid is already in the dictionary, increment its count
if aa in counts:
counts[aa] += 1
# Otherwise, add it to the dictionary with a count of 1
else:
counts[aa] = 1
# If counts is empty after filtering, return 0.0 entropy and None for the mutant amino acid
if not counts:
return 0.0, None
# Calculate Shannon entropy
entropy = 0.0
for key in counts:
freq = counts[key] / sum(counts.values()) # Calculate frequency of each amino acid
entropy += freq * math.log2(freq) # Add the frequency times the log base 2 of the frequency to the entropy
# Remove the wildtype amino acid from counts if it's present
counts.pop(wildtype_aa, None)
# Sort the amino acids by frequency to find the mutant
sorted_aas = sorted(counts.items(), key=lambda x: x[1], reverse=True)
mutant_aa = sorted_aas[0][0] if sorted_aas and sorted_aas[0][1] >= 2 else None
# return entropy
return -entropy, mutant_aa
# Path to the alignment file
alignment_path = nipah_alignment
# Read the alignment file using BioPython's AlignIO
alignment = AlignIO.read(alignment_path, "fasta")
# Convert the alignment to pandas
alignment_dict = {record.id: list(record.seq) for record in alignment}
df_alignment = pd.DataFrame(alignment_dict)
# Extract the wildtype sequence from the DataFrame
wildtype_series = df_alignment['NC_002728.1_Nipah_virus_complete_genome']
# Compute entropy and mutant amino acid for each site in the alignment
values = [shannon_entropy_and_mutant_aa(df_alignment.iloc[i], wildtype_series[i]) for i in range(df_alignment.shape[0])]
# Unpack the computed values into two lists: entropies and mutants
entropies, mutants = zip(*values)
# Create a final DataFrame to hold the computed values along with site numbers
df_final = pd.DataFrame({
'site': range(1, len(mutants) + 1),
'entropy': entropies,
'wildtype': wildtype_series,
'mutant': mutants
})
# Filter to get rid of extra site at end
df_final = df_final[df_final['site'] < 603]
display(df_final)
site | entropy | wildtype | mutant | |
---|---|---|---|---|
0 | 1 | -0.000000 | M | None |
1 | 2 | -0.000000 | P | None |
2 | 3 | 0.820741 | A | T |
3 | 4 | -0.000000 | E | None |
4 | 5 | 0.726963 | N | S |
... | ... | ... | ... | ... |
597 | 598 | 0.101067 | P | None |
598 | 599 | -0.000000 | E | None |
599 | 600 | -0.000000 | Q | None |
600 | 601 | -0.000000 | C | None |
601 | 602 | -0.000000 | T | None |
602 rows × 4 columns
Find polymorphic Nipah sites¶
In [12]:
sites_with_mutants = df_final.loc[df_final['mutant'].notnull(), 'site'].tolist()
polymorphisms = list(sites_with_mutants)
data_series = pd.Series(polymorphisms)
# filter out sites that are outside mutagenized region
filtered_series = data_series[data_series > 71]
polymorphisms = list(filtered_series)
polymorphism_length = len(polymorphisms)
print(f'There are {polymorphism_length} polymorphic sites in NiV RBP sequences (in the mutagenized ectodomain): {polymorphisms}')
There are 32 polymorphic sites in NiV RBP sequences (in the mutagenized ectodomain): [82, 89, 135, 172, 228, 236, 274, 288, 299, 325, 328, 329, 335, 339, 344, 376, 381, 384, 385, 386, 404, 421, 423, 424, 426, 427, 470, 478, 481, 498, 502, 545]
In [13]:
def find_other_henipavirus_mutants(df,virus):
df_comparison = df.rename(columns={virus:'mutant'})
df_comparison = df_comparison[['mutant','nipah','site']]
#filter rows to differences
no_dash_df = df_comparison[~df_comparison['mutant'].str.contains('-') & ~df_comparison['nipah'].str.contains('-')]
filtered_df = no_dash_df[no_dash_df['mutant'] != no_dash_df['nipah']]
sites = list(filtered_df['site'].unique())
data_series = pd.Series(sites)
filtered_series = data_series[data_series > 71]
series = list(filtered_series)
print(f'{virus} is different from nipah at these sites:\n {series}\n')
find_other_henipavirus_mutants(df,'hendra')
find_other_henipavirus_mutants(df,'cedar')
hendra is different from nipah at these sites: [76, 82, 85, 86, 89, 90, 96, 98, 136, 144, 174, 175, 176, 180, 187, 194, 195, 196, 201, 209, 210, 211, 212, 213, 215, 224, 226, 228, 236, 241, 244, 245, 261, 265, 277, 279, 280, 284, 285, 287, 288, 289, 293, 299, 309, 311, 312, 315, 316, 317, 322, 326, 327, 329, 333, 334, 335, 337, 338, 339, 340, 342, 344, 371, 376, 385, 386, 388, 392, 401, 402, 403, 404, 420, 422, 423, 424, 425, 426, 427, 432, 434, 437, 446, 458, 466, 470, 473, 476, 478, 483, 498, 502, 507, 517, 520, 525, 527, 538, 548, 549, 550, 553, 554, 564, 569, 571, 586, 599, 602] cedar is different from nipah at these sites: [72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 89, 92, 93, 94, 95, 96, 98, 99, 100, 102, 106, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 120, 121, 122, 123, 125, 126, 127, 129, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 147, 149, 150, 152, 155, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 182, 184, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 204, 206, 207, 210, 211, 212, 213, 214, 215, 217, 218, 219, 223, 224, 225, 226, 228, 230, 232, 234, 236, 238, 241, 242, 243, 244, 246, 247, 248, 249, 250, 251, 252, 255, 256, 261, 262, 266, 267, 268, 269, 270, 271, 272, 273, 275, 276, 277, 278, 280, 281, 283, 284, 286, 287, 289, 290, 291, 292, 294, 296, 297, 299, 300, 301, 302, 303, 304, 305, 308, 310, 312, 313, 314, 315, 316, 318, 319, 320, 321, 322, 325, 326, 327, 328, 329, 330, 332, 333, 340, 341, 342, 343, 346, 347, 348, 349, 350, 351, 353, 354, 356, 357, 358, 360, 361, 362, 363, 364, 366, 367, 368, 369, 370, 371, 372, 373, 374, 376, 377, 378, 379, 380, 383, 384, 385, 386, 387, 389, 391, 392, 394, 396, 397, 399, 400, 401, 402, 403, 406, 407, 408, 409, 410, 411, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 426, 427, 428, 429, 430, 432, 433, 434, 435, 438, 442, 443, 444, 445, 446, 447, 451, 452, 453, 456, 458, 463, 464, 466, 468, 470, 471, 472, 473, 475, 476, 477, 480, 489, 491, 492, 496, 497, 498, 504, 505, 507, 511, 512, 513, 514, 516, 517, 518, 519, 520, 521, 522, 523, 525, 529, 531, 536, 537, 541, 542, 543, 544, 548, 549, 550, 551, 552, 553, 554, 555, 556, 558, 559, 560, 562, 564, 568, 569, 570, 571, 572, 577, 578, 580, 581, 582, 583, 584, 585, 586, 587, 591, 592, 593, 594, 595, 599, 600]
In [ ]: