make_nipah_phylogeny_baltic¶
Script to pull in a newick tree file, and make a pretty phylogeny with the baltic package https://github.com/evogytis/baltic
- Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
input_phylo = None
output_img = None
In [2]:
# Parameters
input_phylo = "data/custom_analyses_data/alignments/phylo/nipah_whole_genome_phylo.tre"
output_img = "results/images/nipah_phylogeny.pdf"
In [3]:
from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patheffects as path_effects
import matplotlib.patches as mpatches
from matplotlib import gridspec,patheffects
import pandas as pd
import os
from matplotlib.patches import Rectangle,ConnectionPatch
import baltic as bt
Setup working directory
In [4]:
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
Load in newick tree
In [5]:
tree = bt.loadNewick(input_phylo,tip_regex='-([0-9\-]+)$',absoluteTime=False)
Function to plot tree
Function to get country of origin information from genbank files
In [6]:
# Always provide your email address
Entrez.email = "blarsen@fredhutch.org"
def get_country_from_accession_id(accession_ids):
country_info = {}
for accession_id in accession_ids:
# Fetch the record from GenBank
handle = Entrez.efetch(db="nucleotide", id=accession_id, rettype="gb", retmode="text")
# Parse the GenBank record
try:
record = SeqIO.read(handle, "genbank")
except Exception as e:
print(f"Error reading record for {accession_id}: {e}")
handle.close()
continue
handle.close()
# Extract country information from features
country = None
for feature in record.features:
if feature.type == "source":
if "country" in feature.qualifiers:
country = feature.qualifiers["country"][0]
break
country_info[accession_id] = country
return country_info
def read_accession_ids_from_file(file_path):
with open(file_path, 'r') as file:
# Read each line and strip any whitespace or newline characters,
# assuming each line contains one accession ID
accession_ids = [line.strip() for line in file.readlines()]
return accession_ids
file_path = 'data/custom_analyses_data/alignments/phylo/nipah_whole_genome_genbank_accession_ids.txt'
accession_ids = read_accession_ids_from_file(file_path)
# Fetch country information for each accession ID
country_info = get_country_from_accession_id(accession_ids)
print(country_info)
{'NC_002728.1': None, 'OM135495.1': 'India: Kerala State', 'MW535746.1': 'Thailand', 'MK673558.1': 'Malaysia', 'MK673559.1': 'Malaysia', 'MK673560.1': 'Malaysia', 'MK673561.1': 'Malaysia', 'MK673562.1': 'Malaysia', 'MK673563.1': 'Malaysia', 'MK673564.1': 'Bangladesh', 'MK673565.1': 'Bangladesh', 'MK673566.1': 'Bangladesh', 'MK673567.1': 'Bangladesh', 'MK673568.1': 'Bangladesh', 'MK673570.1': 'Bangladesh', 'MK673571.1': 'Bangladesh', 'MK673572.1': 'Bangladesh', 'MK673573.1': 'Bangladesh', 'MK673574.1': 'Bangladesh', 'MK673575.1': 'Bangladesh', 'MK673576.1': 'Bangladesh', 'MK673577.1': 'Bangladesh', 'MK673578.1': 'Bangladesh', 'MK673579.1': 'Bangladesh', 'MK673580.1': 'Bangladesh', 'MK673581.1': 'Bangladesh', 'MK673582.1': 'Bangladesh', 'MK673583.1': 'Bangladesh', 'MK673584.1': 'Bangladesh', 'MK673585.1': 'Bangladesh', 'MK673586.1': 'Bangladesh', 'MK673587.1': 'Bangladesh', 'MK673588.1': 'Bangladesh', 'MK673589.1': 'Bangladesh', 'MK673590.1': 'Bangladesh', 'MK673591.1': 'Bangladesh', 'MK673592.1': 'Bangladesh', 'MN549402.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549403.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549404.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549405.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549406.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549407.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549408.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549409.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549410.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549411.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MK336155.1': 'India: Kozhikode District, Kerala State', 'MK336156.1': 'India: Kozhikode District, Kerala State', 'MK801755.1': 'Cambodia', 'MK575060.1': 'Bangladesh: Raypur, Maniganj', 'MK575061.1': 'Bangladesh: Raypur, Maniganj', 'MK575062.1': 'Bangladesh: Raypur, Maniganj', 'MK575063.1': 'Bangladesh: Raypur, Maniganj', 'MK575064.1': 'Bangladesh: Raypur, Maniganj', 'MK575065.1': 'Bangladesh: Raypur, Maniganj', 'MK575066.1': 'Bangladesh: Raypur, Maniganj', 'MK575067.1': 'Bangladesh: Raypur, Maniganj', 'MK575068.1': 'Bangladesh: Raypur, Maniganj', 'MK575069.1': 'Bangladesh: Raypur, Maniganj', 'MK575070.1': 'Bangladesh: Sylhet', 'MH523640.1': 'India: Kozhikode District, Kerala State', 'MH523641.1': 'India: Kozhikode District, Kerala State', 'MH523642.1': 'India: Kozhikode District, Kerala State', 'MH396625.1': 'India: Kozhikode District, Kerala State', 'KY425646.1': 'Malaysia: Kuala Lumpur', 'KY425655.1': 'Malaysia: Kuala Lumpur', 'JN808857.1': 'Bangladesh: Manikgonj', 'JN808863.1': 'Bangladesh: Rajbari', 'JN808864.1': 'Bangladesh: Faridpur', 'FJ513078.1': 'India', 'AY988601.1': 'Bangladesh', 'AJ627196.1': 'Malaysia', 'AJ564621.1': None, 'AJ564622.1': None, 'AJ564623.1': None, 'AY029767.1': 'Malaysia', 'AY029768.1': 'Malaysia', 'AF212302.2': None}
Convert country information to dataframe
In [7]:
def convert_to_dataframe(country_info):
# Convert dictionary to DataFrame
df = pd.DataFrame(list(country_info.items()), columns=['Accession ID', 'Country'])
return df
# Convert country information to pandas DataFrame
country_df = convert_to_dataframe(country_info)
display(country_df)
fixed_country_df = pd.read_csv('data/custom_analyses_data/alignments/phylo/countrydf.csv')
display(fixed_country_df)
Accession ID | Country | |
---|---|---|
0 | NC_002728.1 | None |
1 | OM135495.1 | India: Kerala State |
2 | MW535746.1 | Thailand |
3 | MK673558.1 | Malaysia |
4 | MK673559.1 | Malaysia |
... | ... | ... |
74 | AJ564622.1 | None |
75 | AJ564623.1 | None |
76 | AY029767.1 | Malaysia |
77 | AY029768.1 | Malaysia |
78 | AF212302.2 | None |
79 rows × 2 columns
Accession ID | Country | |
---|---|---|
0 | NC_002728.1 | Malaysia |
1 | OM135495.1 | India |
2 | MW535746.1 | Thailand |
3 | MK673558.1 | Malaysia |
4 | MK673559.1 | Malaysia |
... | ... | ... |
74 | AJ564622.1 | Malaysia |
75 | AJ564623.1 | Malaysia |
76 | AY029767.1 | Malaysia |
77 | AY029768.1 | Malaysia |
78 | AF212302.2 | Malaysia |
79 rows × 2 columns
In [8]:
def get_accessions_by_country(country_df):
country_dict = {}
country_lists = country_df['Country'].unique()
for country in country_lists:
country_specific_df = country_df[country_df['Country'] == country]
country_dict[country] = list(country_specific_df['Accession ID'])
return country_dict
country_dict = get_accessions_by_country(fixed_country_df)
Now make the tree
In [9]:
def make_tree(tree, country_dict):
mpl.rc('font', family='sans-serif')
mpl.rc('font', serif='Helvetica')
mpl.rc('text', usetex='false')
mpl.rcParams.update({'font.size': 6})
mpl.rcParams['font.weight'] = 'light'
fig = plt.figure(figsize=(7.5, 10), facecolor='w')
gs = gridspec.GridSpec(1, 1, wspace=0.0)
ax = plt.subplot(gs[0], facecolor='w')
### MAP COLORS
colors = ['#5778a4', '#e49444', '#d1615d', '#a87c9f', '#e7ca60']
color_mapping = {country: colors[i % len(colors)] for i, country in enumerate(country_dict.keys())}
color_func = lambda k: color_mapping[next(country for country, ids in country_dict.items() if k.name in ids)]
### DRAW TREE
tree.drawTree()
tree.plotTree(ax, colour='black', width=1, connection_type='baltic')
tree.plotPoints(ax, colour=color_func, size=40, zorder=4)
### ADD LEAF TEXT
x_attr = lambda k: k.x+0.001
kwargs={'va':'center','ha':'left'} ## kwargs for text
#tree.addText(ax,x_attr=x_attr,**kwargs)
### SET MATPLOTLIB INFO
ax.set_xticks([])
ax.set_yticks([])
ax.set_yticklabels([])
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
### MAKE COUNTRY COLOR LEGEND
legend_patches = [mpatches.Patch(color=color, label=country) for country, color in color_mapping.items()]
legend = ax.legend(handles=legend_patches, loc='upper left', bbox_to_anchor=(1, 1), frameon=False, fontsize=6)
plt.tight_layout()
plt.savefig(output_img,dpi=300)
plt.show()
make_tree(tree,country_dict)
In [ ]: