Windows users who get an EOF error can try the following instead:
# Read the table with quote handling switched off and column names left
# unmodified, then remove any double-quote characters from the column names.
counts_raw <- read.delim(
  "data/counts-raw.txt.gz",
  quote = "",
  check.names = FALSE
)
names(counts_raw) <- gsub("\"", "", names(counts_raw), fixed = TRUE)
# Print the cleaned column names.
names(counts_raw)
## [1] "doi" "pubDate"
## [3] "journal" "title"
## [5] "articleType" "authorsCount"
## [7] "f1000Factor" "backtweetsCount"
## [9] "deliciousCount" "pmid"
## [11] "plosSubjectTags" "plosSubSubjectTags"
## [13] "facebookShareCount" "facebookLikeCount"
## [15] "facebookCommentCount" "facebookClickCount"
## [17] "mendeleyReadersCount" "almBlogsCount"
## [19] "pdfDownloadsCount" "xmlDownloadsCount"
## [21] "htmlDownloadsCount" "almCiteULikeCount"
## [23] "almScopusCount" "almPubMedCentralCount"
## [25] "almCrossRefCount" "plosCommentCount"
## [27] "plosCommentResponsesCount" "wikipediaCites"
## [29] "year" "daysSincePublished"
## [31] "wosCountThru2010" "wosCountThru2011"
For everybody else:
# Read the counts table using read.delim()'s default settings.
counts_raw <- read.delim("data/counts-raw.txt.gz")
# Flag which of the first ten author counts are missing.
is.na(counts_raw[["authorsCount"]][seq_len(10)])
## [1] FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
# A comparison against a missing value gives NA rather than FALSE.
counts_raw[["authorsCount"]][seq_len(10)] > 7
## [1] FALSE TRUE NA NA FALSE TRUE NA NA NA FALSE
# Author counts for the rows whose backtweetsCount is greater than 1.
with(counts_raw, authorsCount[backtweetsCount > 1])
## [1] NA 6 7 3 NA 7 NA 5 3 2 NA 2 NA 6 NA NA 10
## [18] NA NA NA 2 9 NA NA NA 18 5 NA 3 NA NA NA 2 4
## [35] NA 4 2 5 2 5 5 4 NA 1 6 4 2 5 14 NA 6
## [52] 6 3 1 8 4 6 15 NA 4 11 2 3 15 3 22 25 18
## [69] 7 12 3 120 4 NA NA NA NA 6 7 NA NA NA NA 16 NA
## [86] NA NA 6 NA 13 NA NA NA NA 7 4 NA 8 NA NA NA 15
## [103] NA NA 4 8 8 20 NA NA 8 9 2 3 6 3 8 5 6
## [120] 1 4 9 5 9 3 6 6 3 2 5 19 3 2 5 4 6
## [137] 1 11 9 8 6 11 3 5 15 5 3 5 9 5 4 3 6
## [154] 2 12 5 4 9 4 2 5 7 13 15 4 2 7 4 3 9
## [171] 9 8 7 6 1 7 5 8 2 4 2 5 4 3 5 5 13
## [188] 2 12 8 10 5 7 3 3 2 5 5 13 2 5 5 6 8
## [205] 10 1 12 3 6 2 3 4 2 8 19 6 4 3 5 1 9
## [222] 5 12 3 3 10 8 8 4 7 7 4 10 7 1 6 3 13
## [239] 9 8 13 8 6 7 5 3 7 2 7 8 10 8 16 10 5
## [256] 8 4 3 2 4 4 8 4 4 7 9 5 5 2 7 8 7
## [273] 6 4 2 4 5 5 3 13 4 5 4 6 5 5 6 6 3
## [290] 7 5 3 2 8 1 3 7 8 4 2 6 2 7 5 7 7
## [307] 8 8 7 16 9 10 3 10 7 1 5 2 17 9 6 7 4
## [324] 11 9 9 6 5 3 2 3 3 3 6 3 3 16 2 2 25
## [341] 6 4 6 3 5 4 7 5 5 9 2 5 3 2 8 2 2
## [358] 8 1 15 7 16 5 5 8 11 11 10 8 11 2 7 7 13
## [375] 8 9 7 3 12 8 5 13 3 8 3 4 6 9 4 2 11
## [392] 5 8 6 10 7 8 6 6 6 13 9 5 3 7 4 9 5
## [409] 6 8 4 3 3 4 7 11 5 8 4 8 7 4 3 4 6
## [426] 4 3 5 7 5 5 8 14 8 11 8 6 2 7 12 3 4
## [443] 5 3 4 4 4 9 8 9 2 2 4 3 15 5 5 7 10
## [460] 2 4 5 3 8 2 5 10 7 3 11 13 2 5 14 2 9
## [477] 7 7 3 5 8 9 13 8 6 6 3 4 3 5 11 14 5
## [494] 4 4 6 5 6 10 3 5 7 4 7 5 4 14 1 3 11
## [511] 11 2 12 17 3 10 2 6 7 4 5 3 7 3 3 16 8
## [528] 4 8 3 7 9 9 7 4 13 5 4 3 2 16 8 5 2
## [545] 9 3 7 4 6 5 6 4 5 9 12 2 3 3 9 8 3
## [562] 11 11 6 3 6 12 4 4 8 15 7 2 4 2 2 6 3
## [579] 3 11 16 4 17 7 3 6 5 9 2 6 14 3 3 2 9
## [596] 5 4 7 6 5 10 6 3 2 7 3 5 5 1 1 5 6
## [613] 5 6 5 3 4 3 3 8 3 2 6 3 9 5 5 2 2
## [630] 7 8 2 3 11 5 7 1 2 6 21 7 4 3 4 3 7
## [647] 3 7 8 2 3 5 3 14 15 18 13 1 4 4 18 7 13
## [664] 13 2 3 2 4 7 2 16 7 9 5 12 5 27 2 9 4
## [681] 3 2 14 3 3 13 7 4 4 4 7 3 6 10 1 7 6
## [698] 5 6 17 11 6 5 3 7 7 4 3 8 4 3 5 4 6
## [715] 5 2 11 1 7 5 6 4 15 2 2 9 4 10 19 3 12
## [732] 3 7 7 5 3 3 9 2 5 4 10 8 6 3 10 2 2
## [749] 5 5 4 5 2 4 4 4 1 3 3 2 9 5 3 6 2
## [766] 6 2 27 4 7 7 5 8 2 8 4 11 5 3 6 11 5
## [783] 19 5 2 5 14 3 6 5 6 5 18 8 4 5 3 6 8
## [800] 10 3 1 2 5 2 2 10 6 7 5 4 7 8 6 5 4
## [817] 2 6 5 2 4 11 4 9 11 9 9 6 6 7 3 5 6
## [834] 2 6 3 5 8 5 8 6 10 3 2 2 18 4 3 7 5
## [851] 4 16 2 4 3 3 2 16 3 6 7 17 5 3 7 5 10
## [868] 3 2 2 5 4 7 6 3 6 4 7 5 4 6 9 2 13
## [885] 3 2 4 6 5 5 6 3 7 12 12 3 4 4 5 7 3
## [902] 7 9 2 2 7 6 5 3 3 4 3 8 6 3 5 3 4
## [919] 9 3 4 15 11 7 13 3 6 5 3 9 8 8 2 5 4
## [936] 6 17 13 7 8 10 9 9 7 3 2 4 3 5 3 10 3
## [953] 4 3 10 4 4 7 8 4 3 2 7 3 3 3 5 8 12
## [970] 3 2 1 4 2 3 2 9 4 5 6 6 1 8 6 2 13
## [987] 5 3 4 9 5 5 5 8 39 4 7 4 7 5 4 10 5
## [1004] 3 7 8 8 12 5 4 3 5 3 4 4 4 9 6 2 9
## [1021] 8 8 16 5 6 5 14 4 2 5 11 21 8 4 7 5 12
## [1038] 2 6 5 4 7 NA NA 10 15 12 6 NA 14 20
# Keep only the rows whose journal equals "pone" (all columns retained);
# dim() then reports the number of rows and columns of that subset.
dim(counts_raw[counts_raw$journal == "pone", ])
## [1] 14099 32
Logical operators: & means "and"; | means "or"; %in% means "in" (membership in a set of values).
# Rows whose journal is "pone" OR "pgen": | combines the two equality tests
# element by element into a single logical row index.
dim(counts_raw[counts_raw$journal == "pone" |
counts_raw$journal == "pgen", ])
## [1] 15868 32
# %in% tests each journal value against a whole set of candidates at once,
# replacing a chain of == comparisons joined with |.
dim(counts_raw[counts_raw$journal %in%
c("pone", "pgen", "pmed"),])
## [1] 17826 32
# ! negates the membership test, keeping the rows whose journal is none of
# "pone", "pgen", or "pmed".
dim(counts_raw[!(counts_raw$journal %in%
c("pone", "pgen", "pmed")),])
## [1] 6505 32
Comparison operators: == means "equals"; != means "not equal".
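The function grepl tests whether a pattern occurs in each element of a vector and returns a logical vector, so it can be used to filter rows: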
# Show the first six entries of plosSubjectTags.
head(counts_raw[["plosSubjectTags"]], n = 6L)
## [1] Cell Biology|Immunology|Molecular Biology
## [2] Biotechnology|Genetics and Genomics|Infectious Diseases|Virology
## [3] Computational Biology|Biotechnology|Genetics and Genomics|Infectious Diseases|Virology
## [4] Cell Biology|Immunology|Molecular Biology
## [5] Genetics and Genomics|Infectious Diseases|Microbiology
## [6] Ecology|Evolutionary Biology|Genetics and Genomics
## 6715 Levels: Anesthesiology and Pain Management ...
# Size of the subset whose subject tags contain the text "Immunology";
# grepl() supplies the logical row index.
dim(counts_raw[
  grepl("Immunology", counts_raw[["plosSubjectTags"]], fixed = TRUE),
])
## [1] 2708 32