Calculate CAZome sizes using cazomevolve.explore
Calculate the size of the CAZome in each genome, as well as: * The number of CAZy families * Proportion of the proteome encapulsated by the CAZome * The CAZy family to CAZyme ratio
Import from cazomevolve.cazome.explore.cazome_sizes.
def count_items_in_cazome(gfp_df, item, grp, round_by=None):
"""Count the number of unique items per genome and per specificed tax grouping
:param gfp_df: panda df, cols = ['Family', 'Genome', 'Protein', 'tax grp', 'tax grp'...]
:param item: str, name of column to calculate incidence for, e.g. 'Protein' or 'Family'
:param grp: str, name of column to group genomes by
:param round_by: int, number of figures to round mean and sd by. If None do not round
Return
* dict of {grp: {genome: {'items': {items}, 'numOfItems': int(num of items)}}}
* df, cols = []
"""
cazome_sizes = {} # {genus: {genome: {'proteins': unique prot acc, 'numOfcazymes': int(num of prots)}}}
for ri in tqdm(range(len(gfp_df)), desc="Gathering CAZy families per genome"):
row = gfp_df.iloc[ri]
genome = row['Genome']
row_item = row[item]
grp_name = row[grp]
try:
cazome_sizes[grp_name]
except KeyError:
cazome_sizes[grp_name] = {}
try:
cazome_sizes[grp_name][genome][f'{item}s'].add(row_item)
except KeyError:
cazome_sizes[grp_name][genome] = {f"{item}s": {row_item}}
cazyme_size_data = []
for grp_name in tqdm(cazome_sizes, desc=f"Calculating num of {item} per genome and per {grp}"):
for genome in cazome_sizes[grp_name]:
num_of_items = len(cazome_sizes[grp_name][genome][f'{item}s'])
cazome_sizes[grp_name][genome][f'numOf{item}s'] = num_of_items
num_of_items = []
for genome in cazome_sizes[grp_name]:
num_of_items.append(cazome_sizes[grp_name][genome][f'numOf{item}s'])
mean_items = np.mean(num_of_items)
sd_items = np.std(num_of_items)
if round_by is not None:
mean_items = mean_items.round(decimals = round_by)
sd_items = sd_items.round(decimals = round_by)
num_genomes = len(list(cazome_sizes[grp_name].keys()))
new_row = [grp_name, mean_items, sd_items, num_genomes]
cazyme_size_data.append(new_row)
cols = [grp, f'Mean{item}s', f'Sd{item}s', 'NumOfGenomes']
cazome_size_df = pd.DataFrame(cazyme_size_data, columns=cols)
return cazome_sizes, cazome_size_df
def get_proteome_sizes(proteome_dir, gfp_df, grp):
"""Count the number of proteins in the proteome file of each genome
Build a dict of proteome sizes grouped by tax lineage ('grp')
:param proteome_dir: Path or str, path to dir containing .faa proteome files
:param gfp_df: pandas df containing families, genomes, tax_rank, tank_rank...
:param grp: str, name of column (tax_rank) to group genomes by, e.g. 'Genus'
Return dict {grp: {genome: {'numOfproteins': int()}}}"""
proteome_files = get_file_paths(proteome_dir, suffixes=['.faa'])
proteome_sizes = {} # {grp: {genome: {'numOfproteins': int()}}}
for fasta_path in tqdm(proteome_files, desc="Getting proteome sizes"):
try:
genome = re.findall(r"GCF_\d+\.\d{1,5}", fasta_path.name)[0]
except IndexError:
try:
genome = re.findall(r"GCA_\d+\.\d{1,5}", fasta_path.name)[0]
except IndexError:
print(
f"Could not retrieve genomic accession from\n{fasta_path}\n"
"Skipping FASTA file"
)
continue
# get tax group
tax_row = gfp_df[gfp_df['Genome'] == genome]
if len(tax_row) == 0:
print(
f"Genome {genome} was not in the FGP file."
"Therefore, skipping proteome proporiton count for genome"
)
continue
tax_group = tax_row[grp].values[0]
num_of_proteins = 0
for record in SeqIO.parse(fasta_path, 'fasta'):
num_of_proteins += 1
try:
proteome_sizes[tax_group]
except KeyError:
proteome_sizes[tax_group] = {}
proteome_sizes[tax_group][genome] = {'numOfProteins': num_of_proteins}
return proteome_sizes
def calc_proteome_representation(proteome_dict, cazome_sizes_dict, grp, round_by=None):
"""Calculate the percentage of the proteome represented by the CAZome per genome
and the mean per tax group ('grp')
:param proteome_dict: dict {grp: {genome: {'numOfproteins': int()}}}
:param cazome_sizes_dict: dict {grp: {genome: {'Proteins': {prots}, 'numOfProteins': int(num of prots)}}}
:param grp: str, name of column (tax_rank) to group genomes by, e.g. 'Genus'
:param round_by: int, num of dp to round means and sd to, if none does not round
Return
* df with columns [grp, 'MeanProteomeSize', 'SdProteomeSize', 'MeanProteomePerc', 'SdProteomePerc', 'NumOfGenomes']
"""
proteome_perc_dict = {} # {grp_name: {genome: {'numOfProteins': int(num prot ids)}}}
for grp_name in tqdm(cazome_sizes_dict, desc='Getting proteome size'):
for genome in cazome_sizes_dict[grp_name]:
# gather num of proteins in CAZome and in the proteome
cazome_size = cazome_sizes_dict[grp_name][genome]['numOfProteins']
try:
proteome_size = proteome_dict[grp_name][genome]['numOfProteins']
except KeyError:
print(f"Not proteome size available for {genome}.\nSkipping this genome")
continue
percentage = (cazome_size / proteome_size) * 100
# check if grp name is in dict
try:
proteome_perc_dict[grp_name]
except KeyError:
proteome_perc_dict[grp_name] = {}
# add percentage of the proteome to the dict
# round only when calc mean
proteome_perc_dict[grp_name][genome] = percentage
# calculate the mean proteome size and percentage per tax group (grp)
# cazome_sizes_dict = {grp: {genome: {'Proteins': {prots}, 'numOfProteins': int(num of prots)}}}
df_data = [] # [[grp_name, mean proteome size, sd proteome size, mean perc, sd perc]]
for grp_name in tqdm(proteome_perc_dict, desc='Calc proteome perc'):
# gather proteome sizes and perc for the grp
grp_proteome_size = []
grp_proteome_perc = []
for genome in proteome_perc_dict[grp_name]:
grp_proteome_size.append(proteome_dict[grp_name][genome]['numOfProteins'])
grp_proteome_perc.append(proteome_perc_dict[grp_name][genome])
# calc means and sd
mean_proteome = np.mean(grp_proteome_size)
sd_proteome = np.std(grp_proteome_size)
mean_perc = np.mean(grp_proteome_perc)
sd_perc = np.std(grp_proteome_perc)
if round_by is not None:
mean_proteome = round(mean_proteome, round_by)
sd_proteome = round(sd_proteome, round_by)
mean_perc = round(mean_perc, round_by)
sd_perc = round(sd_perc, round_by)
num_of_genomes = len(list(proteome_perc_dict[grp_name].keys()))
df_data.append(
[grp_name, mean_proteome, sd_proteome, mean_perc, sd_perc, num_of_genomes]
)
col_names = [grp, 'MeanProteomeSize', 'SdProteomeSize', 'MeanProteomePerc', 'SdProteomePerc', 'NumOfGenomes']
df = pd.DataFrame(df_data, columns=col_names)
return df
def count_cazyme_fam_ratio(fgp_df, grp, round_by=None):
"""Calculate the mean (and SD) CAZyme to CAZy family ratio across the genomes for each group e.g. genus
:param fgp_df: panda df, cols = ['Family', 'Genome', 'Protein', 'tax grp', 'tax grp'...]
:param grp: str, name of column to group genomes by
:param round_by: int, number of figures to round mean and sd by. If None do not round
Return
* dict of {grp: {genome: {'items': {items}, 'numOfItems': int(num of items)}}}
* df, cols = []
"""
cazome_sizes = {} # {genus: {genome: {'proteins': unique prot acc, 'numOfcazymes': int(num of prots)}}}
for ri in tqdm(range(len(fgp_df)), desc="Gathering CAZymes and CAZy families per genome"):
row = fgp_df.iloc[ri]
genome = row['Genome']
fam = row['Family']
protein = row['Protein']
grp_name = row[grp]
try:
cazome_sizes[grp_name]
except KeyError:
cazome_sizes[grp_name] = {}
try:
cazome_sizes[grp_name][genome]['Proteins'].add(protein)
cazome_sizes[grp_name][genome]['Families'].add(fam)
except KeyError:
cazome_sizes[grp_name][genome] = {'Proteins': {protein}, 'Families': {fam}}
cazyme_ratio_data = []
for grp_name in tqdm(cazome_sizes, desc=f"Calculating CAZyme/CAZy family ratio"):
for genome in cazome_sizes[grp_name]:
num_of_cazymes = len(cazome_sizes[grp_name][genome]['Proteins'])
num_of_families = len(cazome_sizes[grp_name][genome]['Families'])
cazyme_fam_ratio = num_of_cazymes / num_of_families
cazome_sizes[grp_name][genome]['ratio'] =cazyme_fam_ratio
ratios = []
for genome in cazome_sizes[grp_name]:
ratios.append(cazome_sizes[grp_name][genome]['ratio'])
mean_ratio = np.mean(ratios)
sd_ratio = np.std(ratios)
if round_by is not None:
mean_ratio = mean_ratio.round(decimals = round_by)
sd_ratio = sd_ratio.round(decimals = round_by)
num_genomes = len(list(cazome_sizes[grp_name].keys()))
new_row = [grp_name, mean_ratio, sd_ratio, num_genomes]
cazyme_ratio_data.append(new_row)
cols = [grp, 'MeanCAZymeToFamRatio', 'SdCAZymeToFamRatio', 'NumOfGenomes']
cazome_ratio_df = pd.DataFrame(cazyme_ratio_data, columns=cols)
return cazome_sizes, cazome_ratio_df