Identify CAZy families that always co-occurr together

Identify CAZy families that always appear together in a set of genomes, although the group of always co-occurring families may not be present in every genome.

Correlation matrix

One approach to identify always co-occurring families is to build a correlation matrix from the CAZy family frequencies.

Import from cazomevolve.cazome.explore.cooccurring_families.

def identify_cooccurring_fams_corrM(df, all_families, core_cazome=[], corrM_path=None, fill_value=2):
    """Build a correlation matrix of the CAZy fam frequencies to identify co-occurring families
    i.e. CAZy families that are always present together

    :param df: fam freq df, pandas df, columns are CAZy families, rows are genomes, cells
        contain CAZy fam frequency
    :param all_families: set of all CAZy families to be analysed
    :param core_cazome: list of CAZy families in the core cazome, provide no list if you want to
        include the core CAZome
    :param corrM_path: path/str to write out the correlation matrix, if none, correlation matrix not
        written to a CSV file
    :param fill_value: int, value to replace Nan values in correlation matrix, if None - deletes columns
        and rows containing Nan values

    Return set of tuples, one tuple per group of co-occuring families
    and the filled correlation matrix
    """
    # convert CAZy fam freq df to binary presence/absence
    binary_fam_df = copy(df)

    for fam in tqdm(all_families, desc="Building binary fam freq df"):
        binary_fam_df.loc[binary_fam_df[fam] > 1, fam] = 1

    for fam in tqdm(all_families, desc="Delete absent families"):
        if 1 not in list(binary_fam_df[fam]):
            binary_fam_df.drop(fam, axis=1)

    for fam in core_cazome:
        binary_fam_df.drop(fam, axis=1)

    # correlation matrix
    fam_corr_M = binary_fam_df[all_families].corr()

    # write out a CSV file
    if corrM_path is not None:
        fam_corr_M.to_csv(corrM_path)

    if fill_value is None:
        fams_to_del = set()
        for fam in list(fam_corr_M.columns):
            for value in list(fam_corr_M[fam]):
                if str(value) == 'nan':
                    fams_to_del.add(fam)

        for fam in fams_to_del:
            fam_corr_M.drop(fam, axis=1)
            fam_corr_M.drop(fam, axis=0)

    else:
        fam_corr_M_filled = fam_corr_M.fillna(fill_value)

    cooccurring_families = set()
    for fam in tqdm(all_families, desc="Identifying always co-occurring families"):
        if 1 not in list(fam_corr_M_filled[fam]):
            continue

        # find rows were value is 1
        value_1_rows = fam_corr_M_filled[fam_corr_M_filled[fam] == 1]
        if len(value_1_rows) == 1:
            continue

        fams = list(value_1_rows.index)
        fams.sort()
        fams = tuple(fams)
        cooccurring_families.add(fams)

    return cooccurring_families, fam_corr_M_filled

Iterative method

Alternatively, cazomevovle can iterate through a dataframe of CAZy family frequencies (see the cazomevolve.cazome.explore.cazy_families submodule) to identify CAZy families that always co-occurr in a genome together.

Import from cazomevolve.cazome.explore.cooccurring_families.

def calc_cooccuring_fam_freqs(df, all_families, exclude_core_cazome=False):
    """Identify groups of CAZy families that are always present together, and count in
    how many genomes the families are present together

    Initially calls another function to identify pairs of CAZy families that are always
    present together.

    Then identifies overlapping pairs of co-occurring CAZy families to identify grps
    of co-occurring families, because if fam1 is always present with fam2, {fam1, fam2}
    and fam3 is always present with fam1 {fam1, fam3} then fam2 and fam3 must always
    be present together because both are always present with fam1.

    :param df: fam freq df, pandas df, columns are CAZy families, rows are genomes, cells
        contain CAZy fam frequency
    :param all_families: set of all CAZy families to be analysed
    :param exclude_core_cazome: whether to exlude the core cazome, default: False -
        include the core CAZome

    Return dict {grp_num: {'fams': {co-occurring fams}, 'freqs': {num of genomes}}
    - returns set of frequencies in case different numbers are produced for each inital pair
    of co-occurring families
    """
    cooccuring_fams_dict = identify_cooccurring_fam_pairs(df, all_families, exclude_core_cazome=exclude_core_cazome)

    cooccurring_groups = {}
    grp_num = 0

    for cofams in tqdm(cooccuring_fams_dict, desc='Combining pairs of co-occurring families'):
        fams = cooccuring_fams_dict[cofams]['fams']

        added = False
        for grp in cooccurring_groups:
            if fams[0] in cooccurring_groups[grp]['fams']:
                cooccurring_groups[grp]['fams'].add(fams[1])
                cooccurring_groups[grp]['freqs'].add(cooccuring_fams_dict[cofams]['freq'])
                added = True

        if added is False:
            for grp in cooccurring_groups:
                if fams[1] in cooccurring_groups[grp]['fams']:
                    cooccurring_groups[grp]['fams'].add(fams[0])
                    cooccurring_groups[grp]['freqs'].add(cooccuring_fams_dict[cofams]['freq'])
                    added = True

        if added is False:
            cooccurring_groups[grp_num] = {
                'fams': {fams[0], fams[1]},
                'freqs': {cooccuring_fams_dict[cofams]['freq']}
            }
            grp_num+=1

    for grp in cooccurring_groups:
        if len(cooccurring_groups[grp]['freqs']) > 1:
            print(f':WARNING: differing freqs found for grp: {cooccurring_groups[grp]}')

    return cooccurring_groups

Build an upset plot

To help visualise differences in the presence of always co-occurring CAZy families, the co-occurring CAZy families can be plotted onto an upset plot.

All functions are imported from cazomevolve.cazome.explore.cooccurring_families.

First build the upset plot membership:

def add_to_upsetplot_membership(upsetplot_membership, cooccurring_fams_dict):
    """Add co-occurring families to upsetplot membership data

    :param upsetplot_membership: list of lists, one nested list per instance of co-occurring families group
    :param cooccurring_fams_dict, dict {grp_num: {'fams': {families}: 'freqs': {ints(num of genomes)}}}

    Return updated list of upsetplot membership
    """
    for grp_num in cooccurring_fams_dict:
        families = list(cooccurring_fams_dict[grp_num]['fams'])
        freq = list(cooccurring_fams_dict[grp_num]['freqs'])[0]
        for i in range(freq):
            upsetplot_membership.append(families)
    return upsetplot_membership

Then build the upset plot:

def build_upsetplot(
    upsetplot_membership,
    file_path=None,
    file_format='svg',
    sort_by='degree',
    sort_categories_by='cardinality',
):
    """Use the upsetplot package to build an upsetplot of co-occurring families

    :param upsetplot_membership: list of lists, one nested list per instance of co-occurring families group
    :param file_path, str/Path, path to write out figure. If none, file is not written out
    :param file_format: str, format to write out file, e.g. svg or png, default, svg
    :param sort_by: str, method to sort subsets
        From Upsetplot:
            sort_by : {'cardinality', 'degree', '-cardinality', '-degree',
                    'input', '-input'}
                If 'cardinality', subset are listed from largest to smallest.
                If 'degree', they are listed in order of the number of categories
                intersected. If 'input', the order they appear in the data input is
                used.
                Prefix with '-' to reverse the ordering.

                Note this affects ``subset_sizes`` but not ``data``.
    :param sort_categories_by: str,
        From UpsetPlot:
            sort_categories_by : {'cardinality', '-cardinality', 'input', '-input'}
                Whether to sort the categories by total cardinality, or leave them
                in the input data's provided order (order of index levels).
                Prefix with '-' to reverse the ordering.

    Return upsetplot
    """
    upset_data = upsetplot.from_memberships(upsetplot_membership)
    coocurring_upset_plot = upsetplot.UpSet(
        upset_data,
        subset_size='sum',
        sort_categories_by=sort_categories_by,
        sort_by=sort_by,
    )
    coocurring_upset_plot.plot();

    if file_path is not None:
        plt.savefig(file_path, format=file_format)

    return coocurring_upset_plot

The upset plot plots a bar chart of the total frequency or incidence of each group of always co-occurring CAZy families.

cazomevolve can break down the incidence by a user defined delineation, e.g. per species or per genus.

First retrieve the groups from the upset plot membership:

def get_upsetplot_grps(upsetplot_membership):
    """Retrieve the groups of CAZy families in the upset plot, in the order they are presented in the plot.

    :param upsetplot_membership: list of lists, membership data used to build the upset plot

    Return list of lists, one nested list per group of co-occurring CAZy families
    """
    upset_data = upsetplot.from_memberships(upsetplot_membership)

    upsetplot_grp_data = upsetplot.query(
        upset_data,
        subset_size='count',
        sort_by='degree',
    )

    # extract the presence/absence data of each CAZy family per group of families in the upsetplot
    # convert series to a df, personally easier to handle and visualise
    upset_plot_df = pd.DataFrame(upsetplot_grp_data.subset_sizes)
    # upsetplot_df contains one row per group in the upset plot
    # going down the df, shows each group left to right on the upset plot

    upset_plot_groups = []

    upsetplot_df_fams = list(upset_plot_df.index.names)

    for ri in tqdm(range(len(upset_plot_df))):
        # the name of each row is a tuple of boolean values, one value per fam
        # marking if fam in group or not
        grp = []

        for i in range(len(upsetplot_df_fams)):
            if upset_plot_df.iloc[ri].name[i]: # is True
                grp.append(upsetplot_df_fams[i])

        upset_plot_groups.append(grp)

    return upset_plot_groups

Then retrieve the incidence per CAZy family group per group (e.g. genus or species), and then build a dataframe with this data. The dataframe can then be used in another tool to build a proporitonal area plot, e.g. using RawGraphs.

def add_upsetplot_grp_freqs(
    upset_plt_groups,
    cooccurring_grp_freq_data,
    cooccurring_fam_dict,
    grp,
    grp_sep=False,
    grp_order=None,
    include_none=False,
):
    """Add data on the incidence of co-occurring grps of CAZy families from the
    cooccurring_fam_dict to cooccurring_grp_freq_data

    :param upset_plt_groups: list of lists, one nested list per grp of co-occurring CAZy families
        grps listed in same order as present in the upsetplot
    :param cooccurring_grp_freq_data: list of lists, one nested list per
        pair of 'grp' and grp of co-occurring CAZy families
    :param cooccurring_fam_dict: dict, {grp_num: {'fams': {families}, 'freqs': {freqs/incidences}}}
    :param grp: str, name of grp to be added to cooccurring_grp_freq_data, e.g. the name of the genus
    :param grp_sep: bool, does the cooccurring_fam_dict contain data separated into grps, e.g. by genus
        {grp(e.g. genus): {grp_num: {'fams': {families}, 'freqs': {freqs/incidences}}}}
    :param grp_order: list of grp names, order to list through grps if grp_sep is True. If None, uses
        order groups are listed in cooccurring_fam_dict
    :param include_none: bool, if True, if a grp of fams is not in the cooccurring_fam_dict, leaves the
        freq as None. If false, the grp of fams is not added to upset_plt_groups

    Return cooccurring_grp_freq_data
    """

    for fam_grp in tqdm(upset_plt_groups, desc="Compiling co-occurring families incidence data"):

        if grp_sep:
            if grp_order is None:
                for grp_name in cooccurring_fam_dict:
                    added = False
                    for grp_num in cooccurring_fam_dict[grp_name]:
                        if cooccurring_fam_dict[grp_name][grp_num]['fams'] == set(fam_grp):
                            cooccurring_grp_freq_data.append(
                                [
                                    "+".join(fam_grp),
                                    grp_name,
                                    list(cooccurring_fam_dict[grp_name][grp_num]['freqs'])[0],
                                ]
                            )
                            added = True

                    if (include_none) and (added is False):
                        cooccurring_grp_freq_data.append(
                            [
                                "+".join(fam_grp),
                                grp_name,
                                None,
                            ]
                        )

            else:
                for grp_name in grp_order:
                    added = False
                    for grp_num in cooccurring_fam_dict[grp_name]:
                        if cooccurring_fam_dict[grp_name][grp_num]['fams'] == set(fam_grp):
                            cooccurring_grp_freq_data.append(
                                [
                                    "+".join(fam_grp),
                                    grp_name,
                                    list(cooccurring_fam_dict[grp_name][grp_num]['freqs'])[0],
                                ]
                            )
                            added = True

                    if (include_none) and (added is False):
                        cooccurring_grp_freq_data.append(
                            [
                                "+".join(fam_grp),
                                grp_name,
                                None,
                            ]
                        )

        else:
            # get the data for the relevant group
            added = False
            for grp_num in cooccurring_fam_dict:
                if cooccurring_fam_dict[grp_num]['fams'] == set(fam_grp):
                    cooccurring_grp_freq_data.append(
                        [
                            "+".join(fam_grp),
                            grp,
                            list(cooccurring_fam_dict[grp_num]['freqs'])[0],
                        ]
                    )
                    added = True

            if (include_none) and (added is False):
                cooccurring_grp_freq_data.append(
                    [
                        "+".join(fam_grp),
                        grp,
                        None,
                    ]
                        )

    return cooccurring_grp_freq_data


def build_upsetplot_matrix(cooccurring_grp_freq_data, grp, file_path=None):
    """Build matrix of grp of CAZy families, grp of interest name (e.g. genus) and incidence
    (i.e. the number of genomes that the grp of CAZy families appeared in)

    :param cooccurring_grp_freq_data: list of lists, one nested list per row in the df
    :param grp: str, name of grouping, i.e. the method used to group the genomes,
        .e.g. 'Genus', or 'Species'
    :param file_path: str/Path, path to write out CSV file. If none, the file is not
        written to file

    Return df
    """
    df = pd.DataFrame(cooccurring_grp_freq_data, columns=[
        'Families',
        grp,
        'Incidence',
    ])

    if file_path is not None:
        df.to_csv(file_path)

    return df