Perform Principal Component Analysis
Principal Component Analysis (PCA) is a statistical a method of dimensional reduction that converts a highly dimensional dataset into a low dimensional data set, while retaining as much information as possible from the original data.
Dimensional reduction is achieved by defining directions in the data long which variation is maximal. These directions are called principal components or PCs.
Run PCA
Import from cazomevolve.cazome.explore.pca.
def perform_pca(df, nComp):
"""Perform PCA on family freq df
:param df: df, rows=genomes, cols=fam freqs
Only contains columns with CAZy family frequency data
Recommend placing the tax data in the index or leaving out
:param nComp: int, number of components
Return PCA object and object for scaling PCA
"""
# scale the data
scaler = StandardScaler()
scaler.fit(df.loc[:, df.columns])
X_scaled = scaler.transform(df.loc[:, df.columns])
cazome_pca = PCA(n_components=nComp)
cazome_pca.fit(X_scaled)
return cazome_pca, X_scaled
Evaluate the PCA
Calculate how much of the variation in the original dataset is captured by the PCA.
Import from cazomevolve.cazome.explore.pca.
def plot_explained_variance(
pca,
nComp,
threshold=0.95,
figsize=(10, 5),
file_path=None,
file_format='png',
dpi=300,
):
"""Plot the cumlative explained variance.
:param pca: sklearn pca object
:param nComp: int, number of PCs
:param threshold: int (float), calc the number of PCs needed to capture this degree of the variance
in the dataset
:param file_path: str/Path, path to write out figure. If none, image is not written to a file
:param file_format: str, format to write out the image. Default, png
:param dpi: int, resolution for saved figure
Retrun numpy array with the explained variance per PC
"""
cumExpVar = np.cumsum(pca.explained_variance_ratio_)
keepPC = [pc for pc in range(nComp) if cumExpVar[pc] >= threshold][0]
print(
'Number of features needed to explain {:1.2f} fraction of total variance is {:2d}. '.format(threshold, keepPC)
)
fig = plt.figure(figsize=figsize)
im = plt.plot( range(nComp), cumExpVar )
plt.xticks(np.arange(0,nComp,5));
plt.xlabel( 'Number of PCs', fontsize=14);
plt.ylabel('Cumulative explained variance', fontsize=14);
plt.show;
if file_path is not None:
plt.savefig(
file_path,
bbox_inches='tight',
format=file_format,
dpi=dpi,
)
return cumExpVar
Calculate the amount of variation captured by each PC, individually.
def plot_scree(pca, nComp=10, file_path=None, file_format='png', dpi=300):
"""Generate scree plot for PCA, plotting the amount of variance captured by each pc, for the
first nComp PCs
:param pca: sklearn pca object
:param nComp: int, number of PCs to plot
:param file_path: str/Path, path to write out figure. If none, image is not written to a file
:param file_format: str, format to write out the image. Default, png
:param dpi: int, resolution for saved figure
Return nothing
"""
PC_values = np.arange(nComp) + 1
plt.plot(PC_values, pca.explained_variance_ratio_[0:nComp], 'o-', linewidth=2, color='blue')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
if file_path is not None:
plt.savefig(
file_path,
bbox_inches='tight',
dpi=dpi,
)
plt.show();
for i in range(nComp):
print(f"Explained variance for {i+1}PC: {pca.explained_variance_ratio_[i]}")
Plot PCA
Project genomes onto user defined pairs of PCs.
Import from cazomevolve.cazome.explore.pca.
def plot_pca(
pca,
X_scaled,
fam_df,
first_pc,
second_pc,
group_by,
file_path=None,
style=None,
style_order=None,
hue_order=None,
font_scale=1.15,
figsize=None,
xlim=None,
ylim=None,
dpi=300,
loc='upper left',
marker_size=100,
markers=True,
):
"""Project genomes onto the PCs
:param pca: sklearn PCA object
:param X_scaled: obj from scaling data
:param fam_df: df of cazy family freqs
:param first_pc: int, number of the first PC
:param second_pc: int, number of the second PC
:param group_by: how to group/colour data, genus or species
OPTIONS
:param file_path: path to write out fig, if none no file saved
:param style: str, name of column to use to define style/marker style
:param style_order: list order to list styles
:param hue_order: list to write/assign categories of colours
:param font_scale: float, scale font. >1 increases font size
:param xlim: tuple, limits of the x axis
:param ylim: tuple, limits of the y axis
:param dpi: int, dpi to write out figure
:param loc: str, location of key
:param marker_size: int, scale of markers
:param markers: dict, pass dict to map each level style to a marker
defined in matplotlib
Return plot
"""
grouping = f"{group_by[0].upper()}{group_by[1:]}"
X_pca = pca.transform(X_scaled)
if figsize is not None:
plt.figure(figsize=figsize)
sns.set(font_scale=font_scale)
if hue_order is not None:
print('Applying hue order')
if style is not None:
print('Applying style')
if style_order is not None:
print('Applying style order')
# all options specified
# apply style order
g = sns.scatterplot(
x=X_pca[:,first_pc-1],
y=X_pca[:, second_pc-1],
data=fam_df,
hue=group_by,
s=marker_size,
hue_order=hue_order,
style=style,
style_order=style_order,
markers=markers,
)
else:
print('Not applying style order')
# use default style order
g = sns.scatterplot(
x=X_pca[:,first_pc-1],
y=X_pca[:, second_pc-1],
data=fam_df,
hue=group_by,
s=marker_size,
hue_order=hue_order,
style=style,
markers=markers,
)
else:
print('Not applying style')
# hue order only
g = sns.scatterplot(
x=X_pca[:,first_pc-1],
y=X_pca[:, second_pc-1],
data=fam_df,
hue=group_by,
s=marker_size,
hue_order=hue_order,
markers=markers,
)
else: # using default hue order - i.e. order data is presented in df
print('Not applying hue order')
if style is not None: # use different markers for catagroies in provided col
print('Applying style')
if style_order is not None: # define the order of the marker styles
print('Applying style order')
# apply style order
g = sns.scatterplot(
x=X_pca[:,first_pc-1],
y=X_pca[:, second_pc-1],
data=fam_df,
hue=group_by,
s=marker_size,
style=style,
style_order=style_order,
markers=markers,
)
else:
print('Not applying style order')
# use default style order
g = sns.scatterplot(
x=X_pca[:,first_pc-1],
y=X_pca[:, second_pc-1],
data=fam_df,
hue=group_by,
s=marker_size,
style=style,
markers=markers,
)
else:
print('Not Applying style')
# no options specified
# do not apply style
g = sns.scatterplot(
x=X_pca[:,first_pc-1],
y=X_pca[:, second_pc-1],
data=fam_df,
hue=group_by,
s=marker_size,
markers=markers,
)
if xlim is not None:
g.set(xlim=xlim);
if ylim is not None:
g.set(ylim=ylim);
g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);
plt.ylabel(f"PC{second_pc} {100 * pca.explained_variance_ratio_[(second_pc - 1)]:.2f}%");
plt.xlabel(f"PC{first_pc} {100 * pca.explained_variance_ratio_[(first_pc - 1)]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc=loc, borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=3, title=None, frameon=False);
if file_path is not None:
plt.savefig(
file_path,
bbox_inches='tight',
dpi=dpi,
)
plt.show();
return plt
def plot_ie_loadings(
pca,
fam_df,
first_pc,
second_pc,
style=False,
threshold=0.7,
font_scale=1.15,
font_size=12,
dpi=300,
fig_size=(16,16),
file_path=None,
marker_size=100,
):
"""Build loadings plot
Modified from cazomevolve - styles points using intracellular/extracellular classification
:param pca: sklearn pca object
:param fam_df: cazy family frequncy df
:param first_pc: int, number of the first PC, e.g. PC1 == 1
:param second_pc: int, number of the second PC e.g. PC2 == 2
:param threshold: correlation cut off for showing labels
Only families with a value greater than the threshold
will be annotated
:param font_scale: scale font
:param font_size: font size of family labels
:param fig_size: tuple (width, height) of final plot
:param file_path: str, path to write out a figure.
If None, no figure is saved
Return nothing"""
sns.set(font_scale=font_scale)
# calculate loading = variables x loadings, returns an array
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
# get labels of variables, i.e. cazy families
loadings_labels = list(fam_df.columns)
try:
loadings_labels.remove('Species')
except (KeyError, ValueError):
pass
try:
loadings_labels.remove('Genus')
except (KeyError, ValueError):
pass
loadings_x = loadings[:,(first_pc-1)]
loadings_y = loadings[:,(second_pc-1)]
loadings_df = pd.DataFrame()
loadings_df['loadings_x'] = loadings_x
loadings_df['loadings_y'] = loadings_y
cazy_class = []
for lbl in loadings_labels:
if lbl.find('GH') != -1:
cazy_class.append('GH')
elif lbl.find('GT') != -1:
cazy_class.append('GT')
elif lbl.find('PL') != -1:
cazy_class.append('PL')
elif lbl.find('CE') != -1:
cazy_class.append('CE')
elif lbl.find('AA') != -1:
cazy_class.append('AA')
else:
cazy_class.append('CBM')
loadings_df['cazy_class'] = cazy_class
ie_classifications = []
for lbl in loadings_labels:
if lbl.startswith("i_"):
ie_classifications.append('Intracellular')
else:
ie_classifications.append('Extracellular')
loadings_df['ie_classification'] = ie_classifications
plt.figure(figsize=fig_size)
g = sns.scatterplot(
x=loadings_x,
y=loadings_y,
data=loadings_df,
hue=cazy_class,
s=marker_size,
style=ie_classifications,
);
g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);
g.set(xlim=(-1,1),ylim=(-1,1));
plt.ylabel(f"PC{second_pc}")
plt.xlabel(f"PC{first_pc}")
texts = [
plt.text(
xval,
yval,
lbl,
ha='center',
va='center',
fontsize=font_size,
) for (xval, yval, lbl) in zip(
loadings_x, loadings_y, loadings_labels
) if abs(xval) > threshold or abs(yval) > threshold
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=3, title=None, frameon=False);
if file_path is not None:
plt.savefig(file_path, dpi=dpi, bbox_inches='tight')
To plot the loadings, i.e. the degree of correlation between each CAZy family and each of the user selected PCs:
def plot_loadings(
pca,
fam_df,
first_pc,
second_pc,
style=False,
threshold=0.7,
font_scale=1.15,
font_size=12,
dpi=300,
fig_size=(16,16),
file_path=None,
marker_size=100,
):
"""Build loadings plot
:param pca: sklearn pca object
:param fam_df: cazy family frequncy df
:param first_pc: int, number of the first PC, e.g. PC1 == 1
:param second_pc: int, number of the second PC e.g. PC2 == 2
:param style: boolean, change shape of points depending on CAZy class
:param threshold: correlation cut off for showing labels
Only families with a value greater than the threshold
will be annotated
:param font_scale: scale font
:param font_size: font size of family labels
:param fig_size: tuple (width, height) of final plot
:param file_path: str, path to write out a figure.
If None, no figure is saved
Return nothing"""
sns.set(font_scale=font_scale)
# calculate loading = variables x loadings, returns an array
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
# get labels of variables, i.e. cazy families
loadings_labels = list(fam_df.columns)
try:
loadings_labels.remove('Species')
except (KeyError, ValueError):
pass
try:
loadings_labels.remove('Genus')
except (KeyError, ValueError):
pass
loadings_x = loadings[:,(first_pc-1)]
loadings_y = loadings[:,(second_pc-1)]
loadings_df = pd.DataFrame()
loadings_df['loadings_x'] = loadings_x
loadings_df['loadings_y'] = loadings_y
cazy_class = []
for lbl in loadings_labels:
if lbl.startswith('GH'):
cazy_class.append('GH')
elif lbl.startswith('GT'):
cazy_class.append('GT')
elif lbl.startswith('PL'):
cazy_class.append('PL')
elif lbl.startswith('CE'):
cazy_class.append('CE')
elif lbl.startswith('AA'):
cazy_class.append('AA')
else:
cazy_class.append('CBM')
loadings_df['cazy_class'] = cazy_class
plt.figure(figsize=fig_size)
if style:
g = sns.scatterplot(x=loadings_x, y=loadings_y, data=loadings_df, hue=cazy_class, s=marker_size, style=cazy_class);
else:
g = sns.scatterplot(x=loadings_x, y=loadings_y, data=loadings_df, hue=cazy_class, s=marker_size);
g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);
g.set(xlim=(-1,1),ylim=(-1,1));
plt.ylabel(f"PC{second_pc}")
plt.xlabel(f"PC{first_pc}")
texts = [
plt.text(
xval,
yval,
lbl,
ha='center',
va='center',
fontsize=font_size,
) for (xval, yval, lbl) in zip(
loadings_x, loadings_y, loadings_labels
) if abs(xval) > threshold or abs(yval) > threshold
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=3, title=None, frameon=False);
if file_path is not None:
plt.savefig(file_path, dpi=dpi, bbox_inches='tight')