goseq using pnas_expression_filtered.tsv

goseq is a Bioconductor package for performing a gene ontology analysis for RNA-seq and other length biased data. For more information on how to use the package, please refer to the vignette. For more information on the pnas_expression file please refer to my older post.

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("goseq")

library(goseq)
data <- read.table("pnas_expression_filtered.tsv",header=T,row.names=1)
nrow(data)
[1] 16494
head(data)
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000124208   478   619   628   744   483   716   240
ENSG00000182463    27    20    27    26    48    55    24
ENSG00000124201   180   218   293   275   373   301    88
ENSG00000124207    76    80    85    97    80    81    37
ENSG00000125835   132   200   200   228   280   204    52
ENSG00000125834    42    60    72    86   131    99    30

d <- data[,1:7]
rownames(d) <- row.names(data)

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("edgeR")

library(edgeR)
group <- c(rep("Control",4),rep("Test",3))
d <- DGEList(counts = d, group=group)
dim(d)
[1] 16494     7
d <- calcNormFactors(d)
d
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000124208   478   619   628   744   483   716   240
ENSG00000182463    27    20    27    26    48    55    24
ENSG00000124201   180   218   293   275   373   301    88
ENSG00000124207    76    80    85    97    80    81    37
ENSG00000125835   132   200   200   228   280   204    52
16489 more rows ...

$samples
        group lib.size norm.factors
lane1 Control   976847    1.0296636
lane2 Control  1154746    1.0372521
lane3 Control  1439393    1.0362662
lane4 Control  1482652    1.0378383
lane5    Test  1820628    0.9537095
lane6    Test  1831553    0.9525624
lane8    Test   680798    0.9583181

d <- estimateCommonDisp(d)
d$common.dispersion
[1] 0.02001965
sqrt(d$common.dispersion)
[1] 0.1414908
de.com <- exactTest(d)

d <- estimateTagwiseDisp(d)
de.tgw <- exactTest(d)
summary(decideTestsDGE(de.tgw, p.value=0.01))
   [,1] 
-1  1404
0  13372
1   1718

genes=as.integer(p.adjust(de.tgw$table$PValue[de.tgw$table$logFC!=0],method="BH")<.01)
names(genes)=row.names(de.tgw$table[de.tgw$table$logFC!=0,])
table(genes)
genes
    0     1 
13372  3122
head(genes)
ENSG00000124208 ENSG00000182463 ENSG00000124201 ENSG00000124207 ENSG00000125835 
              0               0               0               0               0 
ENSG00000125834 
              0
pwf=nullp(genes,"hg19","ensGene")

The x-axis of the plot above are binned lengths of Ensembl genes and the y-axis is the ratio of differentially detected genes. We can see a clear bias in the detection of differential expression with longer genes.


GO.wall=goseq(pwf,"hg19","ensGene")
head(GO.wall)
       category over_represented_pvalue under_represented_pvalue
2269 GO:0005623            1.741568e-52                        1
9863 GO:0044464            1.741568e-52                        1
9830 GO:0044424            2.533098e-49                        1
2268 GO:0005622            2.088365e-48                        1
2238 GO:0005575            6.005558e-48                        1
2321 GO:0005737            2.845073e-47                        1

table(GO.wall$over_represented_pvalue<0.01)

FALSE  TRUE 
14092   844

GO.samp=goseq(pwf,"hg19","ensGene",method="Sampling",repcnt=1000)
head(GO.samp)
     category over_represented_pvalue under_represented_pvalue
36 GO:0000075             0.000999001                        1
41 GO:0000082             0.000999001                        1
46 GO:0000087             0.000999001                        1
67 GO:0000122             0.000999001                        1
77 GO:0000139             0.000999001                        1
92 GO:0000166             0.000999001                        1

table(GO.samp$over_represented_pvalue<0.05)

FALSE  TRUE 
13413  1523

head(GO.nobias)
       category over_represented_pvalue under_represented_pvalue
2269 GO:0005623            1.038052e-67                        1
9863 GO:0044464            1.038052e-67                        1
2238 GO:0005575            3.014341e-64                        1
9830 GO:0044424            3.255426e-60                        1
2268 GO:0005622            6.342680e-60                        1
3482 GO:0008150            1.916257e-59                        1

table(GO.nobias$over_represented_pvalue<0.01)

FALSE  TRUE 
13961   975

summary(GO.wall$category[p.adjust(GO.wall$over_represented_pvalue,method="BH")<.01])
   Length     Class      Mode 
      404 character character

library(GO.db)
for(go in wall.enriched.GO[1:5]){
   print(GOTERM[[go]])
   cat("--------------------------------------\n")
}

GOID: GO:0005623
Term: cell
Ontology: CC
Definition: The basic structural and functional unit of all organisms.
    Includes the plasma membrane and any external encapsulating
    structures such as the cell wall and cell envelope.
--------------------------------------
GOID: GO:0044464
Term: cell part
Ontology: CC
Definition: Any constituent part of a cell, the basic structural and
    functional unit of all organisms.
Synonym: protoplast
--------------------------------------
GOID: GO:0044424
Term: intracellular part
Ontology: CC
Definition: Any constituent part of the living contents of a cell; the
    matter contained within (but not including) the plasma membrane,
    usually taken to exclude large vacuoles and masses of secretory or
    ingested material. In eukaryotes it includes the nucleus and
    cytoplasm.
--------------------------------------
GOID: GO:0005622
Term: intracellular
Ontology: CC
Definition: The living contents of a cell; the matter contained within
    (but not including) the plasma membrane, usually taken to exclude
    large vacuoles and masses of secretory or ingested material. In
    eukaryotes it includes the nucleus and cytoplasm.
Synonym: internal to cell
Synonym: nucleocytoplasm
Synonym: protoplasm
Synonym: protoplast
--------------------------------------
GOID: GO:0005575
Term: cellular_component
Ontology: CC
Definition: The part of a cell or its extracellular environment in
    which a gene product is located. A gene product may be located in
    one or more parts of a cell and its location may be as specific as
    a particular macromolecular complex, that is, a stable, persistent
    association of macromolecules that function together.
Synonym: cellular component
Synonym: cellular component unknown
Synonym: GO:0008372
Secondary: GO:0008372
--------------------------------------

sessionInfo()
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
[1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
[3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] GO.db_2.8.0             org.Hs.eg.db_2.8.0      RSQLite_0.11.2         
 [4] DBI_0.2-5               AnnotationDbi_1.20.2    Biobase_2.18.0         
 [7] BiocGenerics_0.4.0      edgeR_3.0.2             limma_3.14.1           
[10] goseq_1.10.0            geneLenDataBase_0.99.10 BiasedUrn_1.04         
[13] BiocInstaller_1.8.3    

loaded via a namespace (and not attached):
 [1] biomaRt_2.14.0         Biostrings_2.26.2      bitops_1.0-4.2        
 [4] BSgenome_1.26.1        GenomicFeatures_1.10.0 GenomicRanges_1.10.5  
 [7] grid_2.15.2            IRanges_1.16.4         lattice_0.20-10       
[10] Matrix_1.0-10          mgcv_1.7-22            nlme_3.1-105          
[13] parallel_2.15.2        RCurl_1.95-3           Rsamtools_1.10.2      
[16] rtracklayer_1.18.0     stats4_2.15.2          tools_2.15.2          
[19] XML_3.95-0.1           zlibbioc_1.4.0

Bidirectional genes

Download 5' UTR for all RefSeq genes using the UCSC Table Browser.

Separate features according to strand

cat hg19_refgene_five_utr_110914.bed | perl -nle '@a = split; print if $a[5] eq "+";' > hg19_refgene_five_utr_110914_plus.bed
cat hg19_refgene_five_utr_110914.bed | perl -nle '@a = split; print if $a[5] eq "-";' > hg19_refgene_five_utr_110914_neg.bed

Use intersectBed to find overlapping features

#Force strandedness as a test, should have no output
intersectBed -s -a hg19_refgene_five_utr_110914_neg.bed -b hg19_refgene_five_utr_110914_plus.bed
intersectBed -wo -a hg19_refgene_five_utr_110914_neg.bed -b hg19_refgene_five_utr_110914_plus.bed > overlap
cat overlap | perl -nle '@a = split; $t{$a[3]} = '1'; $t{$a[9]} = '1'; END {print join("\n",keys %t)};' | cut -f1,2 -d'_' > unique

Performing a GO enrichment analysis on the unique list of bidirectional genes and using all the genes as the universe list:

R version 2.13.0 (2011-04-13)
Copyright (C) 2011 The R Foundation for Statistical Computing
ISBN 3-900051-07-0
Platform: x86_64-unknown-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library("GO.db")
> library("GOstats")
> entrezUniverse=scan("universe2")
> selectedEntrezIds=scan("entrez")
> hgCutoff = 0.001
> #Biological Process
> params = new("GOHyperGParams",geneIds=selectedEntrezIds,universeGeneIds=entrezUniverse,ontology="BP",pvalueCutoff=hgCutoff,conditional=TRUE,testDirection="
over",annotation="org.Hs.eg.db")
> hgOver=hyperGTest(params)
> summary(hgOver)
       GOBPID       Pvalue OddsRatio    ExpCount Count Size
1  GO:0034641 2.845531e-06  1.675853 116.7780905   157 5140
2  GO:0090304 4.877145e-06  1.688666  90.8551719   128 3999
3  GO:0044260 5.050034e-05  1.551424 137.7708834   173 6064
4  GO:0022403 1.093607e-04  2.174322  16.4261789    33  723
5  GO:0016568 2.753699e-04  2.582012   8.3153271    20  366
6  GO:0006996 2.867515e-04  1.923650  22.5838527    40 1037
7  GO:0006281 3.059896e-04  2.631828   7.7473402    19  341
8  GO:0071841 3.514806e-04  1.571534  61.7969662    87 2720
9  GO:0044238 3.720664e-04  1.481915 184.6411559   215 8127
10 GO:0000075 3.912858e-04  3.099874   4.8619672    14  214
11 GO:0051329 4.542075e-04  2.618305   7.3611092    18  324
12 GO:0034622 4.721711e-04  2.353943   9.9965681    22  440
13 GO:0044248 4.733327e-04  1.759037  30.1941794    49 1329
14 GO:0006399 4.960020e-04  3.894081   2.7944952    10  123
15 GO:0006839 4.964491e-04  4.805971   1.8402773     8   81
16 GO:0010564 5.842741e-04  2.558462   7.5201455    18  331
17 GO:0007049 5.963847e-04  1.790438  26.5136248    44 1167
18 GO:0000387 6.138504e-04  8.383672   0.7043037     5   31
19 GO:0006368 7.234803e-04  5.192143   1.4994852     7   66
20 GO:0051276 7.455445e-04  2.080239  13.8588784    27  610
21 GO:0006974 8.086037e-04  3.376597   3.5085746    11  160
22 GO:0050434 8.996583e-04  5.955524   1.1359736     6   50
23 GO:0051028 9.584968e-04  3.873584   2.5218615     9  111
24 GO:0010467 9.885920e-04  1.455638  89.2420894   115 3928
                                                              Term
1                     cellular nitrogen compound metabolic process
2                                   nucleic acid metabolic process
3                         cellular macromolecule metabolic process
4                                                 cell cycle phase
5                                           chromatin modification
6                                           organelle organization
7                                                       DNA repair
8  cellular component organization or biogenesis at cellular level
9                                        primary metabolic process
10                                           cell cycle checkpoint
11                                interphase of mitotic cell cycle
12                        cellular macromolecular complex assembly
13                                      cellular catabolic process
14                                          tRNA metabolic process
15                                         mitochondrial transport
16                                regulation of cell cycle process
17                                                      cell cycle
18                                     spliceosomal snRNP assembly
19        transcription elongation from RNA polymerase II promoter
20                                         chromosome organization
21                                 response to DNA damage stimulus
22                      positive regulation of viral transcription
23                                                  mRNA transport
24                                                 gene expression
> #Molecular Function
> params = new("GOHyperGParams",geneIds=selectedEntrezIds,universeGeneIds=entrezUniverse,ontology="MF",pvalueCutoff=hgCutoff,conditional=TRUE,testDirection="over",annotation="org.Hs.eg.db")
> hgOver=hyperGTest(params)
> summary(hgOver)
      GOMFID       Pvalue OddsRatio    ExpCount Count Size
1 GO:0003676 0.0001312477  1.664829 53.28987613    79 2437
2 GO:0016206 0.0005217889       Inf  0.04574949     2    2
3 GO:0003723 0.0008795796  1.909603 18.43704529    33  806
4 GO:0050662 0.0009527279  3.105508  4.14032903    12  181
                                   Term
1                  nucleic acid binding
2 catechol O-methyltransferase activity
3                           RNA binding
4                      coenzyme binding
> #Cellular component
> params = new("GOHyperGParams",geneIds=selectedEntrezIds,universeGeneIds=entrezUniverse,ontology="CC",pvalueCutoff=hgCutoff,conditional=TRUE,testDirection="over",annotation="org.Hs.eg.db")
> hgOver=hyperGTest(params)
> summary(hgOver)
       GOCCID       Pvalue OddsRatio     ExpCount Count  Size
1  GO:0043227 8.095989e-15  2.357756 193.81366272   266  8649
2  GO:0043229 1.078643e-14  2.445344 215.07960860   285  9598
3  GO:0005622 1.268144e-13  2.727612 259.96442377   320 11601
4  GO:0031974 7.622271e-13  2.476594  50.44219618   102  2251
5  GO:0070013 1.297462e-12  2.479110  48.64949263    99  2171
6  GO:0005634 9.443707e-12  2.053406 120.73858420   183  5388
7  GO:0044422 3.286750e-11  2.013652 121.41084803   182  5418
8  GO:0005654 6.334903e-08  2.336166  28.48157768    59  1271
9  GO:0043228 4.001227e-06  1.784747  58.62140614    92  2616
10 GO:0005730 1.096997e-05  2.682435  11.33884996    28   506
11 GO:0005694 1.166634e-05  2.622132  12.01111380    29   536
12 GO:0043234 5.725214e-05  1.662474  59.30486691    88  2712
13 GO:0000151 6.952506e-05  4.226438   3.11482242    12   139
14 GO:0005739 8.158783e-05  1.881356  29.46756463    51  1315
15 GO:0005737 1.050799e-04  1.488375 182.27313361   218  8134
16 GO:0015630 3.948281e-04  2.107557  14.67776033    29   655
17 GO:0000775 4.285894e-04  3.660054   3.24927519    11   145
18 GO:0005684 5.008308e-04       Inf   0.04481759     2     2
19 GO:0080008 7.364930e-04 11.749319   0.42576709     4    19
                                 Term
1          membrane-bounded organelle
2             intracellular organelle
3                       intracellular
4             membrane-enclosed lumen
5       intracellular organelle lumen
6                             nucleus
7                      organelle part
8                         nucleoplasm
9      non-membrane-bounded organelle
10                          nucleolus
11                         chromosome
12                    protein complex
13           ubiquitin ligase complex
14                      mitochondrion
15                          cytoplasm
16           microtubule cytoskeleton
17     chromosome, centromeric region
18       U2-type spliceosomal complex
19 CUL4 RING ubiquitin ligase complex
> #KEGG
> params = new("KEGGHyperGParams",geneIds=selectedEntrezIds,universeGeneIds=entrezUniverse,pvalueCutoff=hgCutoff,testDirection="over",annotation="org.Hs.eg.db")
> hgOver=hyperGTest(params)
> summary(hgOver)
  KEGGID       Pvalue OddsRatio ExpCount Count Size          Term
1  03013 0.0003025115  3.918123 3.174733    11  152 RNA transport
>

Although this was a brief analysis, the results are somewhat similar to the findings in the paper Trinklein et al., 2004 (An Abundance of Bidirectional Promoters in the Human Genome). Findings from the paper include:

1. DNA-repair genes are more than fivefold overrepresented in the bidirectional class.
2. Chaperone proteins are almost threefold overrepresented
3. Mitochondrial genes are more than twofold overrepresented

Extract gene names according to GO terms

Find genes that are associated with a known gene ontology term:

#install if you don't have org.Hs.eg.db and GO.db
source("http://bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
biocLite("GO.db")
library(org.Hs.eg.db)
library(GO.db)

go_id = GOID( GOTERM[ Term(GOTERM) == "chromatin remodeling"])

get(go_id, org.Hs.egGO2ALLEGS)

     IEA      IEA      IDA      IEA      IEA      IDA      TAS       IC
    "86"    "473"    "664"    "676"   "1105"   "2186"   "2648"   "3065"
      IC      ISS      IEA      ISS      TAS      IDA      IEA      TAS
  "3066"   "3070"   "4221"   "4678"   "5925"   "5928"   "6594"   "6595"
     IDA      IEA      IDA      IDA      IMP      TAS      NAS      NAS
  "6597"   "6598"   "6599"   "6601"   "6602"   "6827"   "6829"   "6830"
     IEA      IDA      TAS      IEA      TAS      IEA      NAS      IEA
  "6927"   "7141"   "7141"   "7270"   "8289"   "8467"   "8850"   "9031"
     IDA      NAS      IDA      TAS      IDA      TAS      IEA      NAS
  "9557"   "9757"   "9759"  "10014"  "10361"  "10629"  "10661"  "11176"
     NAS      IMP      IEA      IDA      IMP      NAS      IDA      NAS
 "11335"  "22893"  "23314"  "23411"  "50511"  "50943"  "51773"  "54108"
     IDA      TAS      IMP      NAS      IMP      IDA      NAS      IEA
 "54617"  "55193"  "55355"  "57492"  "57680"  "79723"  "84181" "150572"
     IMP      NAS
"201161" "373861"

allegs = get(go_id, org.Hs.egGO2ALLEGS)

genes = unlist(mget(allegs,org.Hs.egSYMBOL))

genes
       86       473       664       676      1105      2186      2648      3065
 "ACTL6A"    "RERE"   "BNIP3"    "BRDT"    "CHD1"    "BPTF"   "KAT2A"   "HDAC1"
     3066      3070      4221      4678      5925      5928      6594      6595
  "HDAC2"   "HELLS"    "MEN1"    "NASP"     "RB1"   "RBBP4" "SMARCA1" "SMARCA2"
     6597      6598      6599      6601      6602      6827      6829      6830
"SMARCA4" "SMARCB1" "SMARCC1" "SMARCC2" "SMARCD1" "SUPT4H1"  "SUPT5H"  "SUPT6H"
     6927      7141      7141      7270      8289      8467      8850      9031
  "HNF1A"    "TNP1"    "TNP1"    "TTF1"  "ARID1A" "SMARCA5"   "KAT2B"   "BAZ1B"
     9557      9757      9759     10014     10361     10629     10661     11176
  "CHD1L"    "MLL4"   "HDAC4"   "HDAC5"    "NPM2"   "TAF6L"    "KLF1"   "BAZ2A"
    11335     22893     23314     23411     50511     50943     51773     54108
   "CBX3"   "BAHD1"   "SATB2"   "SIRT1"   "SYCP3"   "FOXP3"    "RSF1"  "CHRAC1"
    54617     55193     55355     57492     57680     79723     84181    150572
  "INO80"   "PBRM1"   "HJURP"  "ARID1B"    "CHD8" "SUV39H2"    "CHD6"   "SMYD1"
   201161    373861
  "CENPV"   "HILS1"

See also: org.Hs.eg.db

Source: bioconductor mailing list thread -> [BioC] Query Gene Ontology

Of the RefSeq’s that have CpG islands 1,000 bp upstream, what are the GO terms - part 2?

Using GO.db and GOstats, I obtained the gene list with bona fide CpG islands upstream and conducted a GO enrichment analysis. The choice of the gene universe is again all RefSeq gene models. Enriched Biological Processes include:

1 primary metabolic process
2 branched chain family amino acid metabolic process
3 regulation of nucleobase, nucleoside, nucleotide and nucleic acid metabolic process
4 regulation of transcription, DNA-dependent
5 nitrogen compound metabolic process
6 cellular metabolic process
7 mitotic cell cycle checkpoint
8 nucleobase, nucleoside and nucleotide metabolic process
9 cAMP biosynthetic process
10 G-protein signaling, coupled to cAMP nucleotide second messenger
11 chromatin organization
12 cellular biopolymer metabolic process

Molecular functions:

1 sequence-specific DNA binding
2 transcription factor activity

Cellular components:

Term
1 nucleus
2 membrane-bounded organelle
3 chromosomal part
4 cell
5 intracellular part

Transcription factors more likely to have CpG islands 1,000 bp upstream, and not overlapping the 5' UTR?

Gene Ontology enrichment analysis

Updated: 2013 November 23rd

There are many tools available for conducting a gene ontology enrichment analysis. Some online tools that I've heard of are DAVID, PANTHER and GOrilla but I have never actually used them. Within Bioconductor there's GOstats, topGO and goseq (and probably some more). Here I'll test out GOstats and topGO.

Firstly download and install the packages if you haven't already:

source("http://bioconductor.org/biocLite.R")
#A set of annotation maps describing the entire Gene Ontology
biocLite("GO.db")
biocLite("topGO")
biocLite("GOstats")

Now let's create a toy example where my test set of genes are those that are associated with the GO term GO:0007411 (axon guidance). Let's fetch all these genes:

#install if you haven't already
source("http://bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
biocLite("GO.db")
library(org.Hs.eg.db)
library(GO.db)

#for reference's sake
#how to get GO terms from an Entrez gene id following the manual
#http://bioconductor.org/packages/release/data/annotation/manuals/org.Hs.eg.db/man/org.Hs.eg.db.pdf

#org.Hs.egGO is an R object that provides
#mappings between entrez gene identifers and the GO
#identifers that they are directly associated with

entrez_object <- org.Hs.egGO
# Get the entrez gene identifiers that are mapped to a GO ID
mapped_genes <- mappedkeys(entrez_object)
# Convert to a list
entrez_to_go <- as.list(entrez_object[mapped_genes])
#http://www.ncbi.nlm.nih.gov/gene/?term=1
#entrez gene id 1 is A1BG
entrez_to_go[[1]]
$`GO:0008150`
$`GO:0008150`$GOID
[1] "GO:0008150"

$`GO:0008150`$Evidence
[1] "ND"

$`GO:0008150`$Ontology
[1] "BP"


$`GO:0005576`
$`GO:0005576`$GOID
[1] "GO:0005576"

$`GO:0005576`$Evidence
[1] "IDA"

$`GO:0005576`$Ontology
[1] "CC"


$`GO:0003674`
$`GO:0003674`$GOID
[1] "GO:0003674"

$`GO:0003674`$Evidence
[1] "ND"

$`GO:0003674`$Ontology
[1] "MF"

#map GO terms to Entrez gene ids

go_object <- as.list(org.Hs.egGO2EG)
#test
go_object[1]
$`GO:0000002`
    TAS     TAS     ISS     IMP     NAS     IMP     IMP
  "291"  "1890"  "4205"  "4358"  "9361" "10000" "92667"
#same
go_object['GO:0000002']
$`GO:0000002`
    TAS     TAS     ISS     IMP     NAS     IMP     IMP
  "291"  "1890"  "4205"  "4358"  "9361" "10000" "92667"

#GO:0000002 -> mitochondrial genome maintenance
#entrez gene id 291 -> SLC25A4
#which if you look up has the GO term mitochondrial genome maintenance

#now let's get all the entrez gene ids for
#GO:0007411 (axon guidance)

axon_gene <- go_object['GO:0007411']
length(unlist(axon_gene, use.names=F))
[1] 330

length(unique(unlist(axon_gene, use.names=F)))
[1] 319
axon_gene <- unique(unlist(axon_gene, use.names=F))
head(axon_gene)
[1] "25"  "27"  "60"  "71"  "160" "161"

Now let's perform the enrichment analysis:

library("GO.db")
library("GOstats")

#as the universal list, I will use all the genes with GO terms

universe <- mapped_genes
length(axon_gene)
[1] 319
length(universe)
[1] 18105

params <- new('GOHyperGParams',
              geneIds=axon_gene,
              universeGeneIds=universe,
              ontology='BP',
              pvalueCutoff=0.001,
              conditional=F,
              testDirection='over',
              annotation="org.Hs.eg.db"
             )
hgOver <- hyperGTest(params)

hgOver
Gene to GO BP  test for over-representation 
4207 GO BP ids tested (997 have p < 0.001)
Selected gene set size: 319 
    Gene universe size: 14844 
    Annotation package: org.Hs.eg

result <- summary(hgOver)
head(result,20)
       GOBPID Pvalue OddsRatio  ExpCount Count Size
1  GO:0000902      0       Inf 21.060361   319  980
2  GO:0000904      0       Inf 15.215036   319  708
3  GO:0006928      0       Inf 30.945837   319 1440
4  GO:0006935      0       Inf 13.173471   319  613
5  GO:0007409      0       Inf 10.938494   319  509
6  GO:0007411      0       Inf  7.757949   319  361
7  GO:0009605      0       Inf 29.226624   319 1360
8  GO:0022008      0       Inf 25.723727   319 1197
9  GO:0030030      0       Inf 21.060361   319  980
10 GO:0030182      0       Inf 22.414241   319 1043
11 GO:0031175      0       Inf 15.601859   319  726
12 GO:0032989      0       Inf 22.435732   319 1044
13 GO:0032990      0       Inf 15.386958   319  716
14 GO:0040011      0       Inf 28.001684   319 1303
15 GO:0042330      0       Inf 13.173471   319  613
16 GO:0048468      0       Inf 33.847009   319 1575
17 GO:0048666      0       Inf 18.030248   319  839
18 GO:0048667      0       Inf 12.163433   319  566
19 GO:0048699      0       Inf 24.240905   319 1128
20 GO:0048812      0       Inf 12.378335   319  576
                                                    Term
1                                     cell morphogenesis
2         cell morphogenesis involved in differentiation
3                            cellular component movement
4                                             chemotaxis
5                                           axonogenesis
6                                          axon guidance
7                          response to external stimulus
8                                           neurogenesis
9                           cell projection organization
10                                neuron differentiation
11                         neuron projection development
12                      cellular component morphogenesis
13                               cell part morphogenesis
14                                            locomotion
15                                                 taxis
16                                      cell development
17                                    neuron development
18 cell morphogenesis involved in neuron differentiation
19                                 generation of neurons
20                       neuron projection morphogenesis

So as expected we get a bunch of GO terms associated to neurons and of our original GO term, axon guidance.

What if my gene list are not Entrez gene ids?

Use biomaRt. For this example I will show how you can go from Ensembl gene IDs to Entrez gene IDs:

#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
 
library("biomaRt")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")

my_chr <- c(1:22, 'M', 'X', 'Y')
my_ensembl_gene <- getBM(attributes='ensembl_gene_id',
                    filters = 'chromosome_name',
                    values = my_chr,
                    mart = ensembl)

#how many entries
length(my_ensembl_gene)
[1] 52322

five_ensembl <- my_ensembl_gene[1:5,]

five_ensembl_to_entrez <- getBM(attributes=c('ensembl_gene_id', 'entrezgene'), filters = 'ensembl_gene_id', values = five_ensembl, mart = ensembl)
five_ensembl_to_entrez
  ensembl_gene_id entrezgene
1 ENSG00000036549      26009
2 ENSG00000037637      54455
3 ENSG00000221126         NA
4 ENSG00000225011         NA
5 ENSG00000228176         NA

#out of interest how many entrez gene ids?
my_entrez_gene <- getBM(attributes='entrezgene',
                    filters = 'chromosome_name',
                    values = my_chr,
                    mart = ensembl)

length(my_entrez_gene[,1])
[1] 24231

#get some more info on the entrez_gene
my_attribute <- c('entrezgene',
                  'hgnc_symbol',
                  'chromosome_name',
                  'start_position',
                  'end_position',
                  'strand')

my_entrez_gene_info  <- getBM(attributes=my_attribute,
                        filters = c('entrezgene', 'chromosome_name'),
                        values = list(entrezgene=my_entrez_gene$entrezgene, chromosome_name=my_chr),
                        mart = ensembl)

head(my_entrez_gene_info)
  entrezgene hgnc_symbol chromosome_name start_position end_position strand
1     266919   LINC00307              21       31581469     31584101     -1
2       1652         DDT              22       24313554     24322660     -1
3  100422891     MIR4327              21       31747612     31747696     -1
4     115761       ARL11              13       50202435     50208008      1
5      64863      METTL4              18        2537524      2571508     -1
6       3704        ITPA              20        3189514      3204516      1
dim(my_entrez_gene_info)
[1] 26232     6

#entrez gene id on different chromosome locations
my_entrez_gene_info[my_entrez_gene_info$entrezgene=='728369',]
    entrezgene hgnc_symbol chromosome_name start_position end_position strand
306     728369    USP17L24               4        9326891      9328483      1
316     728369    USP17L25               4        9331637      9333229      1
327     728369    USP17L26               4        9336384      9337976      1
337     728369    USP17L27               4        9345874      9347466      1
345     728369    USP17L28               4        9350619      9352211      1
353     728369    USP17L29               4        9355364      9356956      1
363     728369    USP17L30               4        9364855      9366447      1

To learn more about the missing Entrez ID values from the Ensembl conversion see this useful post on BioStars and my post on converting gene identifiers.

Further reading

org.Hs.eg.db reference manual

GOstats vignette.