In this notebook, we use the group-based DISCOVER test to detect mutual exclusivity in predefined gene sets. These gene sets are extracted from the canonical pathway database of MSigDb.
import sys
sys.path.append("../lib")
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy
import discover
import nbsupport.io
import nbsupport.plots
from nbsupport.stats import fdr
dataFile = "../data/tcga/tcga-pancan12.h5"
events = nbsupport.io.load_discover_matrix(dataFile, "/models/combined")
genes = numpy.char.partition(events.rownames.astype(str), "_")[:, 0]
def readGmt(filename):
with open(filename) as stream:
return { line[0] : line[2:] for line in map(str.split, stream) }
def filterGeneSets(geneSets, genes):
return {
name : numpy.intersect1d(genes, geneSet)
for name, geneSet
in geneSets.iteritems()
if len(numpy.intersect1d(genes, geneSet)) > 2 }
pathwayGeneSets = readGmt("../data/msigdb/c2.cp.v5.0.symbols.gmt")
pathwayGeneSets = filterGeneSets(pathwayGeneSets, genes)
selectedPathways = {
name : genes
for name, genes
in pathwayGeneSets.iteritems()
if name.startswith("ST_") or name.startswith("SA_")}
pValues = numpy.array([
discover.groupwise_discover_test(events[numpy.in1d(genes, pathwayGenes)], "impurity")
for pathwayGenes in selectedPathways.itervalues()])
with matplotlib.rc_context({"font.family": "Arial"}):
for p, q, (name, geneSet) in zip(pValues, fdr(pValues), selectedPathways.iteritems()):
if q < 0.05:
x = events[numpy.in1d(genes, geneSet)]
nbsupport.plots.event_plot(x)
p_formatted = ("%.2g" % p).replace("e", "\\times 10^{") + "}"
plt.title("%s (P = $%s$)" % (name, p_formatted))
plt.show()