Parse data

Prior to the exploratory analysis of CAZomes, all required data must be loaded into the memeory and augmented in to data structures that can be handled by cazomevolve.
The following functions are for loading and parsing data aurgment by cazomevolve:
Import from cazomevolve.cazome.explore.parse_data.
def load_fgp_data(fgp_file):
    """Load data from the fgp (fam - genome - protein) tab delimited file from cazomevolve.

    :param fgp_file: str/Path, path to fgp_file

    Return pandas df with columns 'Family', 'Genome' and 'Protein' (containing the protein id/accession)
    """
    fgp_df = pd.read_table(fgp_file, header=None)
    fgp_df.columns = ['Family', 'Genome', 'Protein']

    return fgp_df


def add_tax_column_from_row_index(df, tax_rank, tax_index):
    """Add a column listing the tax for each row in a family frequency df
    where the tax info is in the row index

    :param df: df to add tax data to
    :param tax_rank: str, name of the tax rank, to be used as new column name
    :param tax_index: int, index of tax info in the row name/index to be added to the new column

    Return df with a new column add
    """
    new_col = []

    for i in range(len(df)):
        tax = df.iloc[i].name[tax_index]
        new_col.append(tax)

    df[tax_rank] = new_col

    return df


def load_tax_data(
    tax_csv_path,
    kingdom=False,
    phylum=False,
    tax_class=False,
    tax_order=False,
    tax_family=False,
    genus=False,
    species=False,
):
    """Load tax data compiled by cazomevolve into a pandas df

    :param tax_csv_path: str/Path to csv file of genome, tax_rank, tax_rank
        e.g. 'Genome', 'Genus', 'Species'
    The remaining params are bool checks for lineage ranks included in the tax data file

    Return df of genome, tax_rank, tax_rank,  e.g. 'Genome', 'Genus', 'Species'
    """
    tax_df = pd.read_csv(tax_csv_path)

    col_names = ['Genome']
    if kingdom:
        col_names.append('Kingdom')
    if phylum:
        col_names.append('Phylun')
    if tax_class:
        col_names.append('Class')
    if tax_order:
        col_names.append('Order')
    if genus:
        col_names.append('Genus')
    if species:
        col_names.append('Species')

    try:
        tax_df = tax_df.drop('Unnamed: 0', axis=1)  #'Unnamed: 0' from pandas indexes
    except:
        pass

    for colname in tax_df.columns:
        if colname not in col_names:
            tax_df = tax_df.drop(colname, axis=1)

    return tax_df


def add_tax_data_from_tax_df(
    df,
    tax_df,
    kingdom=False,
    phylum=False,
    tax_class=False,
    tax_order=False,
    tax_family=False,
    genus=False,
    species=False,
):
    """Extract tax data from the tax df and add to the df (e.g. the gfp_df)

    :param df: pandas df, df to add tax data to
    :param df: pandas df containing tax data, with one column called 'Genome'
        and one column per tax rank
    The remaining params are bool checks for lineage ranks to be added to
    the df

    Return df with new taxonomy columns
    """
    tax_ranks = []
    if kingdom:
        tax_ranks.append('Kingdom')
    if phylum:
        tax_ranks.append('Phylun')
    if tax_class:
        tax_ranks.append('Class')
    if tax_order:
        tax_ranks.append('Order')
    if genus:
        tax_ranks.append('Genus')
    if species:
        tax_ranks.append('Species')
    if len(tax_ranks) == 0:
        print('No tax ranks listed to be added to df')
        return df

    for tax_rank in tax_ranks:
        new_col = []
        for ri in tqdm(range(len(df)), desc=f"Collecting {tax_rank} data"):
            # retrieve the row in the tax_df containing the corresponding genome information
            tax_row = tax_df[tax_df['Genome'] == df.iloc[ri]['Genome']]
            new_col.append(tax_row[tax_rank].values[0])
        df[tax_rank] = new_col

    return df


def get_dbcan_fams_data(dbcan_dir, fam_g_path, fam_g_p_path):
    """Retrieve all CAZy families predicted by dbCAN and their frequencies.

    Removes EC numbers, domain ranges and subfams (retains fam of the subfam)
    using parse_dbcan_tool().
    Writes out tab delimited lists using write_tab_files().

    :param dbcan_dir: Path, path to dir containing all dbCAN output dirs
    :param fam_g_path: Path, path to write out fam-genome tab delimited list
    :param fam_g_p_path: Path, path to write out fam-genome-protein tab delimited list

    Return
    * all_fams: a set of all CAZy families
    * fam_freq: Dict {genomic acc: Counter(cazy fam)}
    * cazome_sizes: Dict {genomic_acc: {'CAZymes': (num of CAZymes (num unique protein acc))}}
    """
    # get paths to all dbCAN output dirs
    dbcan_dir_paths = get_dir_paths(dbcan_dir)

    all_fams = set()
    fam_freqs = {}  # genomic acc: Counter objects
    cazome_sizes = {}  # genome: {cazymes: int}

    for dir_path in tqdm(dbcan_dir_paths, desc="Parsing dbCAN output files"):
        genomic_acc = dir_path.name.split("_")[0] + '_' + dir_path.name.split("_")[1]
        overview_file = dir_path / "overview.txt"

        try:
            dbcan_df = pd.read_table(overview_file).drop("EC#", axis=1)
        except FileNotFoundError:
            print(f"Could not find overview.txt file for {genomic_acc}")
            continue

        # drop rows were num of tools is 1
        dbcan_df = dbcan_df[dbcan_df["#ofTools"] != 1]

        # drop domain ranges, EC numbers and subfamilies
        for tool in ['HMMER', 'eCAMI', 'DIAMOND']:
            dbcan_df = parse_dbcan_tool(dbcan_df, tool)

        # get the consensus predictions
        dbcan_df = get_consensus(dbcan_df)

        # get the fam freqs and all cazy fams annotated in the genome
        genome_fam_freqs, genome_fams = get_fam_freq_df(dbcan_df)
        all_fams = all_fams.union(genome_fams)
        fam_freqs[genomic_acc] = genome_fam_freqs
        cazome_sizes[genomic_acc] = {'CAZymes': len(set(dbcan_df['Gene ID']))}

    return all_fams, fam_freqs, cazome_sizes


def parse_dbcan_tool(df, tool, disable=True):
    """Parse the output for a tool in dbCAN.

    Remove domain ranges, EC numbers and CAZy subfamilies.

    :param dbcan_path: Path, path to dbCAN overview.txt file
    :param tool: str, name of tool - col name in df
    :param disable: bool, whether to disable the tqdm p-bar

    Return dataframe with new col added"""
    new_col_data = []
    current_col_data = df[tool]

    for row in tqdm(current_col_data, desc=f"Parsing {tool} output", disable=disable):
        row_data = row.split("+")
        row_fams = ""
        for data in row_data:
            fam = data.split("(")[0]
            fam = fam.split("_")[0]
            if fam.startswith(('G', 'P', 'C', 'A')):
                row_fams += f"{fam}+"
        new_col_data.append(row_fams)

    df[f'parsed_{tool}'] = new_col_data

    return df


def get_consensus(df, disable=True):
    """Get the consensus CAZy family classifications

    i.e. families that at least two tools agree upon

    :param df: pandas df, dbcan output
    :param disable: bool, whether to disable the tqdm p-bar

    Return df with new column = consensus
    """
    consensus_col = []

    for ri in tqdm(range(len(df)), desc="Getting dbCAN consensus", disable=disable):
        row = df.iloc[ri]
        hmmer = set(row[f'parsed_HMMER'].split("+"))
        ecami = set(row[f'parsed_eCAMI'].split("+"))
        diamond = set(row[f'parsed_DIAMOND'].split("+"))

        all_consen = list(hmmer & ecami & diamond)
        hm_ecam = list(hmmer & ecami)
        hm_dia = list(hmmer & diamond)
        ecam_dia = list(ecami & diamond)

        consensus = list(set(all_consen + hm_ecam + hm_dia + ecam_dia))
        consen_data = ""
        for fam in consensus:
            # check if empty str is included
            if len(fam) > 0:
                consen_data += f"{fam}+"

        consensus_col.append(consen_data)

    df['Consensus'] = consensus_col

    return(df)


def get_fam_freq_df(df):
    """Get the frequencies of CAZy families"""
    consensus_data =  [row.split("+") for row in df['Consensus']]
    all_fams = []

    for data in consensus_data:
        for fam in data:
            if len(fam) > 0:
                all_fams.append(fam)

    fam_freqs = Counter(all_fams)
    all_fams = set(all_fams)

    return fam_freqs, all_fams


def build_fam_freq_df(all_fams, fam_freqs):
    """Build a wide df of CAZy fam freqs per genome

    :param all_fams: set, all CAZy families found across all genomes
    :param fam_freqs: dict, {genomic_acc: Counter( cazy families )}

    Return df, rows = genomes, cols = cazy fam freq
    """
    all_fams = list(all_fams)
    all_fams.sort()

    fam_freq_data = []

    for genomic_acc in tqdm(fam_freqs, desc="Build wide df"):
        new_data = [genomic_acc]
        for fam in all_fams:
            try:
                new_data.append(fam_freqs[genomic_acc][fam])
            except KeyError:
                new_data.append(0)

        fam_freq_data.append(new_data)

    col_names = ['Genome']
    col_names += all_fams

    fam_freq_df = pd.DataFrame(fam_freq_data, columns=col_names)

    return fam_freq_df


def index_df(fam_freq_df):
    """Index the necessary columns for further analyses in cazomevolve

    :param fam_freq_df: df, row=genome, col=family

    Return fam freq df with indexed columns for row names"""
    indexed_df = fam_freq_df.set_index(['Genome', 'Genus', 'Species'])
    return indexed_df


def add_grps_col(df, group_by):
    """Add a column identify each grp for each row, e.g. genus or species

    This func should by run before get_grps_cooccurring_fams() and the pca()
    For the PCA this allows the genomes to be coloured by their group
    (i.e. genus or species)

    :param df: df, rows=genomes, cols=fam freqs = fam_freq_df
    :param group_by: str, 'genus' or 'species'

    Return df with new grp column
    """
    if group_by == 'genus':
        group_num = 1
    else:
        group_num = 2

    grps = []
    for ri in tqdm(range(len(df)), desc="Identifying groups in fam freq df"):
        grp = df.iloc[ri].name[group_num].strip()
        grp = f"{grp[0].upper()}{grp[1:]}"
        grps.append(grp)

    grp_df = copy(df)
    grp_df[f"{group_by[0].upper()}{group_by[1:]}"] = grps

    return grp_df