# this cell is tagged as parameters for `papermill` parameterization
input_phylo = None
output_img = None

# Parameters
input_phylo = "data/custom_analyses_data/alignments/phylo/nipah_whole_genome_phylo.tre"
output_img = "results/images/nipah_phylogeny.pdf"

from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patheffects as path_effects
import matplotlib.patches as mpatches
from matplotlib import gridspec,patheffects
import pandas as pd
import os
from matplotlib.patches import Rectangle,ConnectionPatch
import baltic as bt

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

Setup in correct directory

tree = bt.loadNewick(input_phylo,tip_regex='-([0-9\-]+)$',absoluteTime=False)

# Always provide your email address
Entrez.email = "blarsen@fredhutch.org"

def get_country_from_accession_id(accession_ids):
    country_info = {}
    
    for accession_id in accession_ids:
        # Fetch the record from GenBank
        handle = Entrez.efetch(db="nucleotide", id=accession_id, rettype="gb", retmode="text")
        
        # Parse the GenBank record
        try:
            record = SeqIO.read(handle, "genbank")
        except Exception as e:
            print(f"Error reading record for {accession_id}: {e}")
            handle.close()
            continue
        
        handle.close()
        
        # Extract country information from features
        country = None
        for feature in record.features:
            if feature.type == "source":
                if "country" in feature.qualifiers:
                    country = feature.qualifiers["country"][0]
                    break
        
        country_info[accession_id] = country
    
    return country_info

def read_accession_ids_from_file(file_path):
    with open(file_path, 'r') as file:
        # Read each line and strip any whitespace or newline characters,
        # assuming each line contains one accession ID
        accession_ids = [line.strip() for line in file.readlines()]
    return accession_ids


file_path = 'data/custom_analyses_data/alignments/phylo/nipah_whole_genome_genbank_accession_ids.txt'
accession_ids = read_accession_ids_from_file(file_path)

# Fetch country information for each accession ID
country_info = get_country_from_accession_id(accession_ids)
print(country_info)

{'NC_002728.1': None, 'OM135495.1': 'India: Kerala State', 'MW535746.1': 'Thailand', 'MK673558.1': 'Malaysia', 'MK673559.1': 'Malaysia', 'MK673560.1': 'Malaysia', 'MK673561.1': 'Malaysia', 'MK673562.1': 'Malaysia', 'MK673563.1': 'Malaysia', 'MK673564.1': 'Bangladesh', 'MK673565.1': 'Bangladesh', 'MK673566.1': 'Bangladesh', 'MK673567.1': 'Bangladesh', 'MK673568.1': 'Bangladesh', 'MK673570.1': 'Bangladesh', 'MK673571.1': 'Bangladesh', 'MK673572.1': 'Bangladesh', 'MK673573.1': 'Bangladesh', 'MK673574.1': 'Bangladesh', 'MK673575.1': 'Bangladesh', 'MK673576.1': 'Bangladesh', 'MK673577.1': 'Bangladesh', 'MK673578.1': 'Bangladesh', 'MK673579.1': 'Bangladesh', 'MK673580.1': 'Bangladesh', 'MK673581.1': 'Bangladesh', 'MK673582.1': 'Bangladesh', 'MK673583.1': 'Bangladesh', 'MK673584.1': 'Bangladesh', 'MK673585.1': 'Bangladesh', 'MK673586.1': 'Bangladesh', 'MK673587.1': 'Bangladesh', 'MK673588.1': 'Bangladesh', 'MK673589.1': 'Bangladesh', 'MK673590.1': 'Bangladesh', 'MK673591.1': 'Bangladesh', 'MK673592.1': 'Bangladesh', 'MN549402.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549403.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549404.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549405.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549406.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549407.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549408.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549409.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549410.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MN549411.1': 'India: Thodupuza village, Idukki district, Kerala State', 'MK336155.1': 'India: Kozhikode District, Kerala State', 'MK336156.1': 'India: Kozhikode District, Kerala State', 'MK801755.1': 'Cambodia', 'MK575060.1': 'Bangladesh: Raypur, Maniganj', 'MK575061.1': 'Bangladesh: Raypur, Maniganj', 'MK575062.1': 'Bangladesh: Raypur, Maniganj', 'MK575063.1': 'Bangladesh: Raypur, Maniganj', 'MK575064.1': 'Bangladesh: Raypur, Maniganj', 'MK575065.1': 'Bangladesh: Raypur, Maniganj', 'MK575066.1': 'Bangladesh: Raypur, Maniganj', 'MK575067.1': 'Bangladesh: Raypur, Maniganj', 'MK575068.1': 'Bangladesh: Raypur, Maniganj', 'MK575069.1': 'Bangladesh: Raypur, Maniganj', 'MK575070.1': 'Bangladesh: Sylhet', 'MH523640.1': 'India: Kozhikode District, Kerala State', 'MH523641.1': 'India: Kozhikode District, Kerala State', 'MH523642.1': 'India: Kozhikode District, Kerala State', 'MH396625.1': 'India: Kozhikode District, Kerala State', 'KY425646.1': 'Malaysia: Kuala Lumpur', 'KY425655.1': 'Malaysia: Kuala Lumpur', 'JN808857.1': 'Bangladesh: Manikgonj', 'JN808863.1': 'Bangladesh: Rajbari', 'JN808864.1': 'Bangladesh: Faridpur', 'FJ513078.1': 'India', 'AY988601.1': 'Bangladesh', 'AJ627196.1': 'Malaysia', 'AJ564621.1': None, 'AJ564622.1': None, 'AJ564623.1': None, 'AY029767.1': 'Malaysia', 'AY029768.1': 'Malaysia', 'AF212302.2': None}

def convert_to_dataframe(country_info):
    # Convert dictionary to DataFrame
    df = pd.DataFrame(list(country_info.items()), columns=['Accession ID', 'Country'])
    return df
    
# Convert country information to pandas DataFrame
country_df = convert_to_dataframe(country_info)
display(country_df)

fixed_country_df = pd.read_csv('data/custom_analyses_data/alignments/phylo/countrydf.csv')
display(fixed_country_df)

def get_accessions_by_country(country_df):
    country_dict = {}
    country_lists = country_df['Country'].unique()
    for country in country_lists:
        country_specific_df = country_df[country_df['Country'] == country]
        country_dict[country] = list(country_specific_df['Accession ID'])
    return country_dict

country_dict = get_accessions_by_country(fixed_country_df)

def make_tree(tree, country_dict):
    mpl.rc('font', family='sans-serif')
    mpl.rc('font', serif='Helvetica')
    mpl.rc('text', usetex='false')
    mpl.rcParams.update({'font.size': 6})
    mpl.rcParams['font.weight'] = 'light'
    
    fig = plt.figure(figsize=(7.5, 10), facecolor='w')
    gs = gridspec.GridSpec(1, 1, wspace=0.0)
    ax = plt.subplot(gs[0], facecolor='w')

    ### MAP COLORS
    colors = ['#5778a4', '#e49444', '#d1615d', '#a87c9f', '#e7ca60']
    color_mapping = {country: colors[i % len(colors)] for i, country in enumerate(country_dict.keys())}
    color_func = lambda k: color_mapping[next(country for country, ids in country_dict.items() if k.name in ids)]

    ### DRAW TREE
    tree.drawTree()
    tree.plotTree(ax, colour='black', width=1, connection_type='baltic')
    tree.plotPoints(ax, colour=color_func, size=40, zorder=4)

    ### ADD LEAF TEXT
    x_attr = lambda k: k.x+0.001
    kwargs={'va':'center','ha':'left'} ## kwargs for text
    #tree.addText(ax,x_attr=x_attr,**kwargs)

    ### SET MATPLOTLIB INFO
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_yticklabels([])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)

    ### MAKE COUNTRY COLOR LEGEND
    legend_patches = [mpatches.Patch(color=color, label=country) for country, color in color_mapping.items()]
    legend = ax.legend(handles=legend_patches, loc='upper left', bbox_to_anchor=(1, 1), frameon=False, fontsize=6)
    plt.tight_layout()
    plt.savefig(output_img,dpi=300)
    plt.show()
    
make_tree(tree,country_dict)

	Accession ID	Country
0	NC_002728.1	None
1	OM135495.1	India: Kerala State
2	MW535746.1	Thailand
3	MK673558.1	Malaysia
4	MK673559.1	Malaysia
...	...	...
74	AJ564622.1	None
75	AJ564623.1	None
76	AY029767.1	Malaysia
77	AY029768.1	Malaysia
78	AF212302.2	None

make_nipah_phylogeny_baltic¶