forked from brianhigh/get-pems
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfreeway-detector-health.R
245 lines (205 loc) · 10.6 KB
/
freeway-detector-health.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Retrieve freeway detector data from the State of California PeMS website.
#
# * Looks up detector health data for a set of days and a set of freeways.
# * Please configure as needed. See the "Configuration" section for details.
#
# Copyright Brian High (https://github.com/brianhigh) and Surakshya Dhakal
# License: GNU GPL v3 http://www.gnu.org/licenses/gpl.txt
# Close connections and clear objects.
closeAllConnections()
rm(list=ls())
# Load one or more packages into memory, installing as needed.
load.pkgs <- function(pkgs, repos = "http://cran.r-project.org") {
result <- sapply(pkgs, function(pkg) {
if (!suppressWarnings(require(pkg, character.only = TRUE))) {
install.packages(pkg, quiet = TRUE, repos = repos)
library(pkg, character.only = TRUE)}})
}
# Install packages and load into memory.
load.pkgs(c("RCurl", "XML", "plyr"))
# --------------------------------------------------------------------------
# Configuration
# --------------------------------------------------------------------------
# Data folder configuration - where the data files are to be stored
data.folder <- 'data'
# Session configuration - variables used to set up the HTTP session
# You should only need to change the first two (username and password).
username <- '[email protected]'
password <- 's3kr!t'
base.url <- 'http://pems.dot.ca.gov'
user.agent <- 'Mozilla/5.0' # https://en.wikipedia.org/wiki/User_agent
cookies <- 'cookies.txt' # https://en.wikipedia.org/wiki/HTTP_cookie
# Lanes configuration - specific freeway and direction to query
# - Freeway-lane entries must be listed as one entry per line
# - Entries much match this "regex": ^(?:I|SR|US)\\d+[NSEW]?-[NSEW]{1}$
# - Where ^(?:I|SR|US) means: starts with I or SR or US
# - And \\d+[NSEW]?- means:
# - one or more digits
# - *optionally* followed by a single N or S or E or W
# - followed by a single dash
# - And [NSEW]{1}$ means: ends with a single N or S or E or W
# - Example: SR24-W
# - Example: I880S-S
freeways.of.interest.file <- "freeways_of_interest.txt"
# Start date configuration - a vector of a single date or multiple dates
# - query date(s) must be in ISO 8601 form: YYYY-MM-DD
# - See: https://en.wikipedia.org/wiki/ISO_8601
# - Dates much match this "regex": '^\\d{4}-\\d{2}-\\d{2}$'
# - Where this means: four digits, a dash, two digits, a dash, and two digits
# Examples:
# search.date <- c('2016-02-05')
# search.date <- c('2016-02-05', '2016-02-06', '2016-02-07')
# search.date <- seq(as.Date("2015-01-01"), as.Date("2015-12-31"), "days")
search.date <- seq(as.Date("2015-01-01"), as.Date("2015-01-03"), "days")
# Read in configuration file. This file can contain the settings listed above.
if (file.exists("conf.R")) source("conf.R")
# --------------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------------
## Function getDetectorHealthPage will fetch a freeway's detector health page
getDetectorHealthPage <- function(freeway, direction, search.date.str,
s.time.id, curl, base.url, data.folder) {
# Combine variables into a "lane" string
lane <- paste('fwy=', freeway, '&dir=', direction, sep='')
# Parse the search.date.str into a vector
search.date.v <- unlist(strsplit(search.date.str, '-'))
names(search.date.v) <- c("year", "month", "day")
# Combine variables into a "start date" (sdate) string
sdate <- paste(search.date.v[['month']],
search.date.v[['day']],
search.date.v[['year']],
sep='%2F')
# Combine variables into a "file date" (fdate) string
fdate <- paste(search.date.v[['year']],
search.date.v[['month']],
search.date.v[['day']],
sep='')
# Page configuration - query specification for type of report page
form.num <- '1'
node.name <- 'Freeway'
content <- 'detector_health'
export.type <- 'text'
# Combine variables into a "page" (page) string
page <- paste('report_form=', form.num, '&dnode=', node.name, '&content=',
content, '&export=', export.type, sep='')
# Combine variables into a "output filename" (output.filename) string
output.filename <- paste(data.folder, '/', node.name, '-',
content, '-', freeway, '-', direction, '-', fdate,
'.tsv', sep='')
# If the data filehas alread been saved, load the file, or get from web
cat(freeway, "-", direction, " ")
if (! file.exists(output.filename)) {
tryCatch({
# Get TSV file for the detector_health for chosen freeway and date
r.url <- paste(base.url, '/?', page, '&', lane, '&s_time_id=',
s.time.id, '&s_time_id_f=', sdate, sep='')
# Get TSV data file from website and store as a string in memory
r = dynCurlReader()
result.string <- getURL(url = r.url, curl = curl)
# Write string to file
writeLines(result.string, output.filename)
# Read table from string into a dataframe
health <- read.table(text=result.string, sep='\t', header=T,
fill=T, quote='', stringsAsFactors=F)
}, error=function(e) {
cat("ERROR :",conditionMessage(e), "\n")
return(data.frame(NULL))
})
} else {
# Read from file
health <- read.table(output.filename, sep='\t', header=T, fill=T,
quote='', stringsAsFactors=F)
}
tryCatch({
# Add variables to make this dataset unique from others and return
if (nrow(health) > 0) health$search.date <- as.Date(search.date.str)
return(health)
}, error=function(e) {
cat("ERROR :",conditionMessage(e), "\n")
return(data.frame(NULL))
})
}
## Function getDetectorHealth will fetch the detector health for each freeway
# in the dataframe "freeways" for a specific date "search.date.str"
getDetectorHealth <- function(freeways, search.date.str, curl,
base.url, data.folder) {
cat("\n", "Trying ", search.date.str, "...", "\n")
# Calculate s_time_id (Unix time integer) from search.date.str
s.time.id <- as.character(as.integer(as.POSIXct(search.date.str,
origin="1970-01-01",
tz = "GMT")))
detector.health <- adply(.data=freeways, .margins=c(1),
.fun=function(x) getDetectorHealthPage(
x$freeway, x$direction, search.date.str,
s.time.id, curl, base.url, data.folder))
}
## Function subsetFreeways will subset freeways by those of interest
subsetFreeways <- function(freeways, freeways.of.interest.file) {
# If there is a freeways_of_interest file, subset freeways by its contents.
if (file.exists(freeways.of.interest.file)) {
freeways.of.interest <- readLines(freeways.of.interest.file)
# Remove quotation marks, if present
freeways.of.interest <- gsub('["\\\']', '', freeways.of.interest)
# Remove any which do not match the required format
freeways.of.interest <- freeways.of.interest[
grep('^(?:I|SR|US)\\d+[NSEW]?-[NSEW]{1}$', freeways.of.interest)]
# Convert to dataframe and merge with "freeways" to perform subset
freeways.of.interest <- data.frame(name=freeways.of.interest,
stringsAsFactors=F)
freeways <- merge(freeways, freeways.of.interest, by = "name")
}
return(freeways)
}
## Function getFreeways() finds "freeway" choices in HTML select option tags.
# Note: You could also extract available cities and counties with this method.
getFreeways <- function(doc) {
optValues <- xpathSApply(htmlParse(doc),
'//form[@class="crossNav"]/select[@name="url"]/option',
function(x) paste(xmlAttrs(x)["value"],
'&name=', xmlValue(x), sep=''))
freeways <- optValues[grepl('dnode=Freeway', optValues)]
freeways <- strsplit(gsub("[^&]*=", "", freeways), '&')
freeways <- adply(.data=unname(freeways), .margins=c(1))
freeways <- freeways[, c(3:5)]
names(freeways) <- c("freeway", "direction", "name")
return(freeways)
}
# --------------------------------------------------------------------------
# Main routine
# --------------------------------------------------------------------------
# Create the data folder if needed.
dir.create(file.path(data.folder), showWarnings = FALSE, recursive = TRUE)
# Load homepage, get a cookie, and parse output for a dataframe of freeways.
# From this dataframe, we can look-up the freeway number and lane directions.
# We can loop-through this dataframe (or a subset) to process many freeways.
formdata <- paste('redirect=&username=', username, '&password=', password,
'&login=Login', sep='')
# Configure curl to use a cookie file and a custom user agent string.
curl <- getCurlHandle(cookiefile = cookies, cookiejar = cookies,
useragent = user.agent)
# Load the page into R as a string.
r = dynCurlReader()
res <- curlPerform(postfields = formdata, url = base.url, curl = curl,
post = 1L, writefunction = r$update)
result.string <- r$value()
# Parse the page to get freeway data and write to a CSV file.
freeways <- getFreeways(result.string)
write.csv(freeways, paste(data.folder, "freeways.csv", sep="/"), row.names=F)
# Select only those freeways which are of interest to us.
freeways <- subsetFreeways(freeways, freeways.of.interest.file)
# Get detector health for each date and freeway and write to a CSV file.
search.date <- as.character(search.date)
search.date <- search.date[grep('^\\d{4}-\\d{2}-\\d{2}$', search.date)]
detector.health <- adply(.data=search.date, .margins=c(1),
.fun=function(x) getDetectorHealth(freeways, x, curl,
base.url,
data.folder))
# Remove extra variables.
detector.health$X1 <- NULL
detector.health$name <- NULL
# Save dataframe as CSV.
write.csv(detector.health, paste(data.folder, "detector_health.csv", sep="/"),
row.names=F)
# Clean up. Cookies file will be written to disk. Memory will be freed.
rm(curl)
gc()