Load the dplyr package.

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# If dplyr is not installed yet, run this once first: install.packages("dplyr")

Load our altmetrics data.

# Read the altmetrics counts table from the .gz text file into a data frame.
# read.delim() expects a tab-separated file with a header row.
counts_raw <- read.delim(file = "data/counts-raw.txt.gz")

Only investigate articles that are primary research articles.

# Keep only the rows whose articleType is exactly "Research Article",
# then report the (rows, columns) of the result.
research <- filter(
  counts_raw,
  articleType == "Research Article"
)
dim(research)
## [1] 21096    32

What about the year 2006?

# Restrict the research articles to those whose year is 2006.
research_2006 <- filter(
  research,
  year == 2006
)
dim(research_2006)
## [1] 873  32

Articles in 2006 that have a Facebook comment or a tweet.

# Research articles from 2006 with at least one tweet or at least one
# Facebook comment: the year test is AND-ed with the OR of the two count tests.
research_2006_fb_tweet <- filter(
  research,
  year == 2006 &
    (backtweetsCount > 0 | facebookCommentCount > 0)
)
dim(research_2006_fb_tweet)
## [1] 13 32

How to select columns?

Obtain article information:

# Pick the article-description columns by listing each name explicitly.
article_info <- select(
  counts_raw,
  doi, pubDate, journal, title, articleType, authorsCount
)
colnames(article_info)
## [1] "doi"          "pubDate"      "journal"      "title"       
## [5] "articleType"  "authorsCount"
# Select a contiguous run of columns with the range form:
# every column from doi through authorsCount.
article_info <- select(
  counts_raw,
  doi:authorsCount
)
colnames(article_info)
## [1] "doi"          "pubDate"      "journal"      "title"       
## [5] "articleType"  "authorsCount"

Obtain metrics:

# Start from every column whose name contains "Count", then drop
# authorsCount and every column whose name contains "facebook".
metrics <- select(
  counts_raw,
  contains("Count"),
  -authorsCount,
  -contains("facebook")
)
colnames(metrics)
##  [1] "backtweetsCount"           "deliciousCount"           
##  [3] "mendeleyReadersCount"      "almBlogsCount"            
##  [5] "pdfDownloadsCount"         "xmlDownloadsCount"        
##  [7] "htmlDownloadsCount"        "almCiteULikeCount"        
##  [9] "almScopusCount"            "almPubMedCentralCount"    
## [11] "almCrossRefCount"          "plosCommentCount"         
## [13] "plosCommentResponsesCount" "wosCountThru2010"         
## [15] "wosCountThru2011"
dim(metrics)
## [1] 24331    15

In the terminal, the pipe command is |. In dplyr, the pipe command is %>%.

Obtain articles from 2006 but only the facebook data.

# Chain the verbs with %>%: keep the 2006 research articles, then keep
# only the columns whose names contain "facebook".
facebook_2006 <- research %>%
  filter(year == 2006) %>%
  select(contains("facebook"))
dim(facebook_2006)
## [1] 873   4
colnames(facebook_2006)
## [1] "facebookShareCount"   "facebookLikeCount"    "facebookCommentCount"
## [4] "facebookClickCount"

Confirming results as you build a dplyr chain of commands:

# Ending the chain with head() prints only the first rows, a quick way to
# check the result of the steps written so far.
research %>%
  filter(year == 2006) %>%
  select(contains("facebook")) %>%
  head()
##   facebookShareCount facebookLikeCount facebookCommentCount
## 1                  0                 0                    0
## 2                  0                 0                    0
## 3                  0                 0                    0
## 4                  0                 0                    0
## 5                  0                 0                    0
## 6                  0                 0                    0
##   facebookClickCount
## 1                  0
## 2                  0
## 3                  0
## 4                  0
## 5                  0
## 6                  0

How to sort the data frame? Use the function arrange.

# Sort by author count, largest first, and show the first rows of that column.
research %>%
  arrange(desc(authorsCount)) %>%
  select(authorsCount) %>%
  head()
##   authorsCount
## 1          158
## 2          144
## 3          120
## 4          117
## 5          114
## 6           82
# After filtering for research articles, authorsCount has no missing values.
anyNA(research[["authorsCount"]])
## [1] FALSE
# Base R indexing: rows 1 to 10 and columns 1 to 5, both chosen by position.
research[1:10, 1:5]
##                             doi    pubDate journal
## 1  10.1371/journal.pbio.0000001 2003-10-13    pbio
## 2  10.1371/journal.pbio.0000002 2003-11-17    pbio
## 3  10.1371/journal.pbio.0000005 2003-08-18    pbio
## 4  10.1371/journal.pbio.0000006 2003-08-18    pbio
## 5  10.1371/journal.pbio.0000010 2003-10-13    pbio
## 6  10.1371/journal.pbio.0000012 2003-10-13    pbio
## 7  10.1371/journal.pbio.0000013 2003-09-15    pbio
## 8  10.1371/journal.pbio.0000019 2003-09-15    pbio
## 9  10.1371/journal.pbio.0000020 2003-10-13    pbio
## 10 10.1371/journal.pbio.0000021 2003-10-13    pbio
##                                                                                                                                   title
## 1                                                           A Functional Analysis of the Spacer of V(D)J Recombination Signal Sequences
## 2                                                                           Viral Discovery and Sequence Recovery Using DNA Microarrays
## 3                                               The Transcriptome of the Intraerythrocytic Developmental Cycle of Plasmodium falciparum
## 4                   DNA Analysis Indicates That Asian Elephants Are Native to Borneo and Are Therefore a High Priority for Conservation
## 5                                       The Roles of APC and Axin Derived from Experimental and Theoretical Analysis of the Wnt Pathway
## 6                                     Genome-Wide RNAi of C. elegans Using the Hypersensitive rrf-3 Strain Reveals Novel Gene Functions
## 7                                                                   Drosophila Free-Running Rhythms Require Intercellular Communication
## 8                                               From Gene Trees to Organismal Phylogeny in Prokaryotes:The Case of the γ-Proteobacteria
## 9  Candidate Gene Association Study in Type 2 Diabetes Indicates a Role for Genes Involved in β-Cell Function as Well as Insulin Action
## 10                                                  Developmental Origin and Evolution of Bacteriocytes in the Aphid–Buchnera Symbiosis
##         articleType
## 1  Research Article
## 2  Research Article
## 3  Research Article
## 4  Research Article
## 5  Research Article
## 6  Research Article
## 7  Research Article
## 8  Research Article
## 9  Research Article
## 10 Research Article
# Positional indexing with dplyr: slice() picks rows 1 to 10 and
# select() picks columns 1 to 5.
research %>%
  slice(1:10) %>%
  select(1:5)
##                             doi    pubDate journal
## 1  10.1371/journal.pbio.0000001 2003-10-13    pbio
## 2  10.1371/journal.pbio.0000002 2003-11-17    pbio
## 3  10.1371/journal.pbio.0000005 2003-08-18    pbio
## 4  10.1371/journal.pbio.0000006 2003-08-18    pbio
## 5  10.1371/journal.pbio.0000010 2003-10-13    pbio
## 6  10.1371/journal.pbio.0000012 2003-10-13    pbio
## 7  10.1371/journal.pbio.0000013 2003-09-15    pbio
## 8  10.1371/journal.pbio.0000019 2003-09-15    pbio
## 9  10.1371/journal.pbio.0000020 2003-10-13    pbio
## 10 10.1371/journal.pbio.0000021 2003-10-13    pbio
##                                                                                                                                   title
## 1                                                           A Functional Analysis of the Spacer of V(D)J Recombination Signal Sequences
## 2                                                                           Viral Discovery and Sequence Recovery Using DNA Microarrays
## 3                                               The Transcriptome of the Intraerythrocytic Developmental Cycle of Plasmodium falciparum
## 4                   DNA Analysis Indicates That Asian Elephants Are Native to Borneo and Are Therefore a High Priority for Conservation
## 5                                       The Roles of APC and Axin Derived from Experimental and Theoretical Analysis of the Wnt Pathway
## 6                                     Genome-Wide RNAi of C. elegans Using the Hypersensitive rrf-3 Strain Reveals Novel Gene Functions
## 7                                                                   Drosophila Free-Running Rhythms Require Intercellular Communication
## 8                                               From Gene Trees to Organismal Phylogeny in Prokaryotes:The Case of the γ-Proteobacteria
## 9  Candidate Gene Association Study in Type 2 Diabetes Indicates a Role for Genes Involved in β-Cell Function as Well as Insulin Action
## 10                                                  Developmental Origin and Evolution of Bacteriocytes in the Aphid–Buchnera Symbiosis
##         articleType
## 1  Research Article
## 2  Research Article
## 3  Research Article
## 4  Research Article
## 5  Research Article
## 6  Research Article
## 7  Research Article
## 8  Research Article
## 9  Research Article
## 10 Research Article

Challenges:

Using a chain of pipes, output the titles of the three research articles with the largest 2011 citation count (wosCountThru2011).

# Sort by 2011 citation count (descending), keep the top three rows,
# and show only their titles.
research %>%
  arrange(desc(wosCountThru2011)) %>%
  slice(1:3) %>%
  select(title)
##                                                                     title
## 1                        Relaxed Phylogenetics and Dating with Confidence
## 2                                                  Human MicroRNA Targets
## 3 Projections of Global Mortality and Burden of Disease from 2002 to 2030

Using a chain of pipes, output the author count (authorsCount), title, journal, and subject tags (plosSubjectTags) of the three research articles with the largest number of authors.

# Sort by author count (descending), keep the top three rows, and show
# the author count, title, journal, and subject tags for each.
research %>%
  arrange(desc(authorsCount)) %>%
  slice(1:3) %>%
  select(
    authorsCount,
    title,
    journal,
    plosSubjectTags
  )
##   authorsCount
## 1          158
## 2          144
## 3          120
##                                                                                                         title
## 1                           Integrative Annotation of 21,037 Human Genes Validated by Full-Length cDNA Clones
## 2 Genome-Wide Association Scan Meta-Analysis Identifies Three Loci Influencing Adiposity and Fat Distribution
## 3                    Common Genetic Variants and Modification of Penetrance of BRCA2-Associated Breast Cancer
##   journal                                  plosSubjectTags
## 1    pbio      Computational Biology|Genetics and Genomics
## 2    pgen Diabetes and Endocrinology|Genetics and Genomics
## 3    pgen                            Genetics and Genomics

How to create new columns? Use the function mutate.

# Add two derived age columns. The second expression uses the column
# created by the first one within the same mutate() call.
# Note: 52 weeks is 364 days, so yearsSincePublished is an approximation.
research <- mutate(
  research,
  weeksSincePublished = daysSincePublished / 7,
  yearsSincePublished = weeksSincePublished / 52
)

Summarize the data with summarize.

# Collapse all research articles into one row of statistics for
# plosCommentCount: mean, standard deviation, row count, and standard
# error of the mean (sd / sqrt(n)). Later entries reuse earlier ones.
research %>%
  summarize(
    plos_mean = mean(plosCommentCount),
    plos_sd = sd(plosCommentCount),
    num = n(),
    plos_sem = plos_sd / sqrt(num)
  )
##   plos_mean  plos_sd   num    plos_sem
## 1 0.2642681 1.230676 21096 0.008473125
# Compute the plosCommentCount statistics (mean, sd, count, standard error)
# separately for every journal/year combination.
research %>%
  group_by(journal, year) %>%
  summarize(
    plos_mean = mean(plosCommentCount),
    plos_sd = sd(plosCommentCount),
    num = n(),
    plos_sem = plos_sd / sqrt(num)
  )
## Source: local data frame [42 x 6]
## Groups: journal [?]
## 
##    journal  year  plos_mean   plos_sd   num   plos_sem
##     <fctr> <int>      <dbl>     <dbl> <int>      <dbl>
## 1     pbio  2003 0.00000000 0.0000000    33 0.00000000
## 2     pbio  2004 0.03448276 0.1829922   174 0.01387260
## 3     pbio  2005 0.06818182 0.3133398   176 0.02361887
## 4     pbio  2006 0.10326087 0.4375668   184 0.03225785
## 5     pbio  2007 0.15270936 0.4988403   203 0.03501172
## 6     pbio  2008 0.20304569 0.6055785   197 0.04314568
## 7     pbio  2009 0.23204420 0.7826385   181 0.05817306
## 8     pbio  2010 0.16384181 0.6836092   177 0.05138321
## 9     pcbi  2005 0.09090909 0.3481553    55 0.04694525
## 10    pcbi  2006 0.06617647 0.2776143   136 0.02380523
## ..     ...   ...        ...       ...   ...        ...