Load the dplyr package.
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# install.packages("dplyr")
Load our altmetrics data.
counts_raw <- read.delim("data/counts-raw.txt.gz")
Only investigate articles that are primary research articles.
research <- filter(counts_raw, articleType == "Research Article")
dim(research)
## [1] 21096 32
What about the year 2006?
research_2006 <- filter(research, year == 2006)
dim(research_2006)
## [1] 873 32
Articles in 2006 that have a Facebook comment or a tweet.
# Research articles from 2006 with at least one tweet or one Facebook comment.
# Comma-separated conditions in filter() are combined with AND; the `|`
# inside the last condition is an element-wise OR.
research_2006_fb_tweet <- filter(
  research,
  year == 2006,
  backtweetsCount > 0 | facebookCommentCount > 0
)
dim(research_2006_fb_tweet)
## [1] 13 32
How to select columns?
Obtain article information:
# Select the article-description columns by naming each one explicitly.
article_info <- select(
  counts_raw,
  doi, pubDate, journal, title, articleType, authorsCount
)
colnames(article_info)
## [1] "doi" "pubDate" "journal" "title"
## [5] "articleType" "authorsCount"
article_info <- select(counts_raw, doi:authorsCount)
colnames(article_info)
## [1] "doi" "pubDate" "journal" "title"
## [5] "articleType" "authorsCount"
Obtain metrics:
# Start from every column whose name contains "Count", then remove
# authorsCount and all facebook columns (a leading minus drops a selection).
metrics <- select(
  counts_raw,
  contains("Count"),
  -authorsCount,
  -contains("facebook")
)
colnames(metrics)
## [1] "backtweetsCount" "deliciousCount"
## [3] "mendeleyReadersCount" "almBlogsCount"
## [5] "pdfDownloadsCount" "xmlDownloadsCount"
## [7] "htmlDownloadsCount" "almCiteULikeCount"
## [9] "almScopusCount" "almPubMedCentralCount"
## [11] "almCrossRefCount" "plosCommentCount"
## [13] "plosCommentResponsesCount" "wosCountThru2010"
## [15] "wosCountThru2011"
dim(metrics)
## [1] 24331 15
In the terminal, the pipe command is |. In dplyr, the pipe command is %>%.
Obtain articles from 2006 but only the facebook data.
# Pipe the research articles through a year filter, then keep only the
# facebook columns. Each step receives the previous result as its first
# argument.
facebook_2006 <- research %>%
  filter(year == 2006) %>%
  select(contains("facebook"))
dim(facebook_2006)
## [1] 873 4
colnames(facebook_2006)
## [1] "facebookShareCount" "facebookLikeCount" "facebookCommentCount"
## [4] "facebookClickCount"
Confirming results as you build a dplyr chain of commands:
# Preview the first rows of the chain's result without storing it; a quick
# way to check each step while building up a longer pipeline.
research %>%
  filter(year == 2006) %>%
  select(contains("facebook")) %>%
  head()  # explicit call: every pipe step reads as a function call
## facebookShareCount facebookLikeCount facebookCommentCount
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## facebookClickCount
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
How to sort the data frame? Use the function arrange.
# Sort by author count, largest first (desc() reverses the default ascending
# order), then show the first rows of that single column.
research %>%
  arrange(desc(authorsCount)) %>%
  select(authorsCount) %>%
  head()  # explicit call: every pipe step reads as a function call
## authorsCount
## 1 158
## 2 144
## 3 120
## 4 117
## 5 114
## 6 82
# No more NA's after filtering for research articles
anyNA(research$authorsCount)
## [1] FALSE
research[1:10, 1:5]
## doi pubDate journal
## 1 10.1371/journal.pbio.0000001 2003-10-13 pbio
## 2 10.1371/journal.pbio.0000002 2003-11-17 pbio
## 3 10.1371/journal.pbio.0000005 2003-08-18 pbio
## 4 10.1371/journal.pbio.0000006 2003-08-18 pbio
## 5 10.1371/journal.pbio.0000010 2003-10-13 pbio
## 6 10.1371/journal.pbio.0000012 2003-10-13 pbio
## 7 10.1371/journal.pbio.0000013 2003-09-15 pbio
## 8 10.1371/journal.pbio.0000019 2003-09-15 pbio
## 9 10.1371/journal.pbio.0000020 2003-10-13 pbio
## 10 10.1371/journal.pbio.0000021 2003-10-13 pbio
## title
## 1 A Functional Analysis of the Spacer of V(D)J Recombination Signal Sequences
## 2 Viral Discovery and Sequence Recovery Using DNA Microarrays
## 3 The Transcriptome of the Intraerythrocytic Developmental Cycle of Plasmodium falciparum
## 4 DNA Analysis Indicates That Asian Elephants Are Native to Borneo and Are Therefore a High Priority for Conservation
## 5 The Roles of APC and Axin Derived from Experimental and Theoretical Analysis of the Wnt Pathway
## 6 Genome-Wide RNAi of C. elegans Using the Hypersensitive rrf-3 Strain Reveals Novel Gene Functions
## 7 Drosophila Free-Running Rhythms Require Intercellular Communication
## 8 From Gene Trees to Organismal Phylogeny in Prokaryotes:The Case of the γ-Proteobacteria
## 9 Candidate Gene Association Study in Type 2 Diabetes Indicates a Role for Genes Involved in β-Cell Function as Well as Insulin Action
## 10 Developmental Origin and Evolution of Bacteriocytes in the Aphid–Buchnera Symbiosis
## articleType
## 1 Research Article
## 2 Research Article
## 3 Research Article
## 4 Research Article
## 5 Research Article
## 6 Research Article
## 7 Research Article
## 8 Research Article
## 9 Research Article
## 10 Research Article
research %>% slice(1:10) %>% select(1:5)
## doi pubDate journal
## 1 10.1371/journal.pbio.0000001 2003-10-13 pbio
## 2 10.1371/journal.pbio.0000002 2003-11-17 pbio
## 3 10.1371/journal.pbio.0000005 2003-08-18 pbio
## 4 10.1371/journal.pbio.0000006 2003-08-18 pbio
## 5 10.1371/journal.pbio.0000010 2003-10-13 pbio
## 6 10.1371/journal.pbio.0000012 2003-10-13 pbio
## 7 10.1371/journal.pbio.0000013 2003-09-15 pbio
## 8 10.1371/journal.pbio.0000019 2003-09-15 pbio
## 9 10.1371/journal.pbio.0000020 2003-10-13 pbio
## 10 10.1371/journal.pbio.0000021 2003-10-13 pbio
## title
## 1 A Functional Analysis of the Spacer of V(D)J Recombination Signal Sequences
## 2 Viral Discovery and Sequence Recovery Using DNA Microarrays
## 3 The Transcriptome of the Intraerythrocytic Developmental Cycle of Plasmodium falciparum
## 4 DNA Analysis Indicates That Asian Elephants Are Native to Borneo and Are Therefore a High Priority for Conservation
## 5 The Roles of APC and Axin Derived from Experimental and Theoretical Analysis of the Wnt Pathway
## 6 Genome-Wide RNAi of C. elegans Using the Hypersensitive rrf-3 Strain Reveals Novel Gene Functions
## 7 Drosophila Free-Running Rhythms Require Intercellular Communication
## 8 From Gene Trees to Organismal Phylogeny in Prokaryotes:The Case of the γ-Proteobacteria
## 9 Candidate Gene Association Study in Type 2 Diabetes Indicates a Role for Genes Involved in β-Cell Function as Well as Insulin Action
## 10 Developmental Origin and Evolution of Bacteriocytes in the Aphid–Buchnera Symbiosis
## articleType
## 1 Research Article
## 2 Research Article
## 3 Research Article
## 4 Research Article
## 5 Research Article
## 6 Research Article
## 7 Research Article
## 8 Research Article
## 9 Research Article
## 10 Research Article
Challenges:
Using a chain of pipes, output the titles of the three research articles with the largest 2011 citation count (wosCountThru2011).
# Sort by 2011 citation count (descending), keep the first three rows,
# and show only their titles.
research %>%
  arrange(desc(wosCountThru2011)) %>%
  slice(1:3) %>%
  select(title)
## title
## 1 Relaxed Phylogenetics and Dating with Confidence
## 2 Human MicroRNA Targets
## 3 Projections of Global Mortality and Burden of Disease from 2002 to 2030
Using a chain of pipes, output the author count (authorsCount), title, journal, and subject tags (plosSubjectTags) of the three research articles with the largest number of authors.
# Sort by number of authors (descending), keep the first three rows,
# and show the requested descriptive columns.
research %>%
  arrange(desc(authorsCount)) %>%
  slice(1:3) %>%
  select(authorsCount, title, journal, plosSubjectTags)
## authorsCount
## 1 158
## 2 144
## 3 120
## title
## 1 Integrative Annotation of 21,037 Human Genes Validated by Full-Length cDNA Clones
## 2 Genome-Wide Association Scan Meta-Analysis Identifies Three Loci Influencing Adiposity and Fat Distribution
## 3 Common Genetic Variants and Modification of Penetrance of BRCA2-Associated Breast Cancer
## journal plosSubjectTags
## 1 pbio Computational Biology|Genetics and Genomics
## 2 pgen Diabetes and Endocrinology|Genetics and Genomics
## 3 pgen Genetics and Genomics
How to create new columns? Use the function mutate.
# Add two derived columns. mutate() evaluates its arguments in order, so
# yearsSincePublished can use the weeksSincePublished column defined on the
# line before it. Dividing weeks by 52 treats a year as 364 days, so the
# year value is an approximation.
research <- research %>%
  mutate(
    weeksSincePublished = daysSincePublished / 7,
    yearsSincePublished = weeksSincePublished / 52
  )
Summarize the data with summarize.
# Collapse all research articles into a single summary row: mean and standard
# deviation of PLOS comment counts, the number of articles, and the standard
# error of the mean (computed from the summaries defined just above it).
research %>%
  summarize(
    plos_mean = mean(plosCommentCount),
    plos_sd = sd(plosCommentCount),
    num = n(),
    plos_sem = plos_sd / sqrt(num)
  )
## plos_mean plos_sd num plos_sem
## 1 0.2642681 1.230676 21096 0.008473125
# Same summary statistics, but computed separately for every journal/year
# combination: group_by() defines the groups and summarize() then returns
# one row per group.
research %>%
  group_by(journal, year) %>%
  summarize(
    plos_mean = mean(plosCommentCount),
    plos_sd = sd(plosCommentCount),
    num = n(),
    plos_sem = plos_sd / sqrt(num)
  )
## Source: local data frame [42 x 6]
## Groups: journal [?]
##
## journal year plos_mean plos_sd num plos_sem
## <fctr> <int> <dbl> <dbl> <int> <dbl>
## 1 pbio 2003 0.00000000 0.0000000 33 0.00000000
## 2 pbio 2004 0.03448276 0.1829922 174 0.01387260
## 3 pbio 2005 0.06818182 0.3133398 176 0.02361887
## 4 pbio 2006 0.10326087 0.4375668 184 0.03225785
## 5 pbio 2007 0.15270936 0.4988403 203 0.03501172
## 6 pbio 2008 0.20304569 0.6055785 197 0.04314568
## 7 pbio 2009 0.23204420 0.7826385 181 0.05817306
## 8 pbio 2010 0.16384181 0.6836092 177 0.05138321
## 9 pcbi 2005 0.09090909 0.3481553 55 0.04694525
## 10 pcbi 2006 0.06617647 0.2776143 136 0.02380523
## .. ... ... ... ... ... ...