#===================================================================================================
# LAWA DATA PREPARATION - WFS
# lawa_dataPrep_WFS.R
#
# 4 July 2018
#
# Sean Hodges
# Horizons Regional Council
#===================================================================================================
# Clearing workspace
rm(list = ls())
ANALYSIS<-"LOAD WFS"
# Set working directory
od <- getwd()
wd <- "\\\\file\\herman\\R\\OA\\08\\02\\2018\\Water Quality\\R\\lawa_state"
setwd(wd)
logfolder <- "\\\\file\\herman\\R\\OA\\08\\02\\2018\\Water Quality\\ROutput\\"
#/* -===Include required function libraries===- */
source("scripts/WQualityStateTrend/lawa_state_functions.R")
## Supplementary functions
ld <- function(url,dataLocation,case.fix=TRUE){
xmlfile <- FALSE # returned as FALSE if the source cannot be parsed
if(dataLocation=="web"){
# download to a temp file, parse it, then tidy up
str<- tempfile(pattern = "file", tmpdir = tempdir())
download.file(url,destfile=str,method="wininet")
xmlfile <- xmlParse(file = str)
unlink(str)
} else if(dataLocation=="file"){
cc(url)
message("trying file ",url,"\nContent type 'text/xml'\n")
if(grepl("xml$",url)){
xmlfile <- xmlParse(url)
}
}
return(xmlfile)
}
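# Example usage of ld() - the URL here is purely illustrative:
# xmldata <- ld("http://example.govt.nz/wfs?request=GetFeature&service=WFS","web")
# xmldata <- ld("MonitoringSites.xml","file")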
# Remove underscores from a file - assuming underscores occur only in element names
us <- function(file){
str <- readLines(file)
y <- gsub(x=str,pattern = "[_]",replacement ="",perl = TRUE)
writeLines(y,file)
}
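# Illustration: us() rewrites <emar:Council_Site_ID>...</emar:Council_Site_ID>
# as <emar:CouncilSiteID>...</emar:CouncilSiteID>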
# Normalise the case of known element names (WFS feeds differ in the case they
# return) and strip ns2/ns3 namespace prefixes from a downloaded file
cc <- function(file){
x <- readLines(file)
y <- gsub( "SITEID", "SiteID", x, ignore.case = TRUE )
y <- gsub( "ELEVATION", "Elevation", y, ignore.case = TRUE )
y <- gsub( "COUNCILSITEID", "CouncilSiteID", y, ignore.case = TRUE )
y <- gsub( "LAWASITEID", "LawaSiteID", y, ignore.case = TRUE )
y <- gsub( "SWMANAGEMENTZONE", "SWManagementZone", y, ignore.case = TRUE )
y <- gsub( "GWMANAGEMENTZONE", "GWManagementZone", y, ignore.case = TRUE )
y <- gsub( "CATCHMENT", "Catchment", y, ignore.case = TRUE )
y <- gsub( "NZREACH", "NZReach", y, ignore.case = TRUE )
y <- gsub( "DESCRIPTION", "Description", y, ignore.case = TRUE )
y <- gsub( "PHOTOGRAPH", "Photograph", y, ignore.case = TRUE )
y <- gsub( "SWQUALITY", "SWQuality", y, ignore.case = TRUE )
y <- gsub( "SWQUALITYSTART", "SWQualityStart", y, ignore.case = TRUE )
y <- gsub( "SWQFREQUENCYALL", "SWQFrequencyAll", y, ignore.case = TRUE )
y <- gsub( "SWQFREQUENCYLAST5", "SWQFrequencyLast5", y, ignore.case = TRUE )
y <- gsub( "SWQALTITUDE", "SWQAltitude", y, ignore.case = TRUE )
y <- gsub( "SWQLANDUSE", "SWQLanduse", y, ignore.case = TRUE )
y <- gsub( "RWQUALITY", "RWQuality", y, ignore.case = TRUE )
y <- gsub( "RWQUALITYSTART", "RWQualityStart", y, ignore.case = TRUE )
y <- gsub( "LWQUALITY", "LWQuality", y, ignore.case = TRUE )
y <- gsub( "LWQUALITYSTART", "LWQualityStart", y, ignore.case = TRUE )
y <- gsub( "LTYPE", "LType", y, ignore.case = TRUE )
y <- gsub( "LFENZID", "LFENZID", y, ignore.case = TRUE )
y <- gsub( "MACRO", "Macro", y, ignore.case = TRUE )
y <- gsub( "MACROSTART", "MacroStart", y, ignore.case = TRUE )
y <- gsub( "SWQUANTITY", "SWQuantity", y, ignore.case = TRUE )
y <- gsub( "SWQUANTITYSTART", "SWQuantityStart", y, ignore.case = TRUE )
y <- gsub( "REGION", "Region", y, ignore.case = TRUE )
y <- gsub( "AGENCY", "Agency", y, ignore.case = TRUE )
y <- gsub( "ns2.", "", y, ignore.case = TRUE )
y <- gsub( "ns3.", "", y, ignore.case = TRUE )
writeLines(y,file)
}
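# Illustration: cc() normalises <emar:COUNCILSITEID>...</emar:COUNCILSITEID>
# to <emar:CouncilSiteID>...</emar:CouncilSiteID>, whatever case the feed used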
# ======================================
# Load WFS locations from CSV
## Load csv with WFS addresses
urls2018 <- "//file/herman/R/OA/08/02/2018/Water Quality/R/lawa_state/CouncilWFS.csv"
urls <- read.csv(urls2018,stringsAsFactors=FALSE)
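# CouncilWFS.csv is expected to contain at least the columns used below:
# Agency, URL and Source ("web" or "file")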
#urls$Agency[urls$Agency=="TDC"] <- "xTDC" ## Commenting out Tasman DC due to Hilltop Server issues
#urls2016 <- "//file/herman/R/OA/08/02/2016/Water Quality/R/lawa_state/CouncilWFS.csv"
#urls <- read.csv(urls2016,stringsAsFactors=FALSE)
stopGapNames <- read.csv("agencyRegion.csv",stringsAsFactors=FALSE)
# Drop BOPRC - GIS server erroring
#urls <- urls[-2,]
# Drop GDC - error on server
#urls <- urls[-5,]
#url <- "https://hbrcwebmap.hbrc.govt.nz/arcgis/services/emar/MonitoringSiteReferenceData/MapServer/WFSServer?request=GetFeature&service=WFS&typename=MonitoringSiteReferenceData&srsName=urn:ogc:def:crs:EPSG:6.9:4326"
#url = "http://gis.horizons.govt.nz/arcgis/services/emar/MonitoringSiteReferenceData/MapServer/WFSServer?request=GetFeature&service=WFS&typename=MonitoringSiteReferenceData"
# Config for data extract from WFS
vars <- c("SiteID","CouncilSiteID","LawaSiteID","SWQuality","SWQAltitude","SWQLanduse",
"SWQFrequencyAll","SWQFrequencyLast5","Region","Agency")
### Even though the field names have been defined in the documentation, there are still
### differences in the field names specified by each council. Either:
### 1. Define a method that determines the name of the elements in each WFS feed; OR
### 2. Note discrepancies as ERRORS and feed back to the supplying council.
### We'll go with option 2 for the moment.
### LOG START: output to ROutput folder
logfile <- paste(logfolder,"lawa_dataPrep_WFS.log",sep="")
sink(logfile)
###
for(h in 1:length(urls$URL)){
if(grepl("^x", urls$Agency[h])){ ## agencies prefixed with "x" are deliberately excluded
next
}
if(!nzchar(urls$URL[h])){ ## nzchar() returns FALSE for an empty URL string
next
}
if(h==3){ ## hard-coded skip of the third council in the CSV
next
}
# Fixing case issue with attribute names with WRC
if(urls$Agency[h]=="WRC"){
str<- tempfile(pattern = "file", tmpdir = tempdir())
(download.file(urls$URL[h],destfile=str,method="wininet"))
cc(str)
xmldata <- xmlParse(file = str)
unlink(str)
} else if(urls$Agency[h]=="ECAN") {
# Fixing field name issue with Ecan
str<- tempfile(pattern = "file", tmpdir = tempdir())
(download.file(urls$URL[h],destfile=str,method="wininet"))
us(str)
xmldata <- xmlParse(file = str)
unlink(str)
} else {
#load up every other end point without needing to fix case in the file.
xmldata<-ld(urls$URL[h],urls$Source[h])
}
if(urls$Source[h]=="file" & grepl("csv$",urls$URL[h])){
cc(urls$URL[h])
tmp <- read.csv(urls$URL[h],stringsAsFactors=FALSE,strip.white = TRUE,sep=",")
tmp <- tmp[,c(5,7,8,9,30,10,28,29,20,21,22)] # the site attribute columns plus "pos" (a "lat long" pair)
tmp$Lat <- sapply(strsplit(as.character(tmp$pos),' '), "[",1)
tmp$Long <- sapply(strsplit(as.character(tmp$pos),' '), "[",2)
tmp <- tmp[-11] # drop the raw "pos" column now that Lat and Long are split out
if(!exists("siteTable")){
siteTable<-as.data.frame(tmp,stringsAsFactors=FALSE)
} else{
siteTable<-rbind.data.frame(siteTable,tmp,stringsAsFactors=FALSE)
}
rm(tmp)
} else {
### Determine the values used in the [emar:SWQuality] element
swq<-unique(sapply(getNodeSet(doc=xmldata, path="//emar:MonitoringSiteReferenceData/emar:SWQuality"), xmlValue))
ns <- "emar:"
## If the emar namespace does not occur before TypeName in the element, try without the namespace
## (Hilltop Server v1.80 excludes the namespace from the element with the TypeName tag)
if(length(swq)==0){
swq<-unique(sapply(getNodeSet(doc=xmldata, path="//MonitoringSiteReferenceData/emar:SWQuality"), xmlValue))
ns <- ""
}
# Since the possible values Yes/No, True/False, Y/N, T/F (in any combination of case) all
# sort into the same alphabetic order, the affirmative value (Y, Yes, True, T, ...) is always
# item 2 in this character vector. Handy.
# Enforcing order in swq
swq<-swq[order(swq,na.last = TRUE)]
# Resolving case issue of values for SWQuality element returned from Ecan
if(identical(swq,c("NO","Yes","YES"))){
swq <- c("NO","YES")
}
if(length(swq)==2){
module <- paste("[emar:SWQuality='",swq[2],"']",sep="")
} else {
module <- paste("[emar:SWQuality='",swq,"']",sep="")
}
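# Illustration: with swq == c("No","Yes") the predicate built above is
# [emar:SWQuality='Yes'], restricting every XPath query below to SWQuality sites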
#xmltop<-xmlRoot(xmldata)
#c <- length(xmlSApply(xmltop, xmlSize)) # number of children for i'th E Element inside <Data></Data> tags
cat("\n",urls$Agency[h],"\n---------------------------\n",urls$URL[h],"\n",module,"\n",sep="")
# Determine number of records in a wfs with module before attempting to extract all the necessary columns needed
if(length(sapply(getNodeSet(doc=xmldata,
path=paste("//",ns,"MonitoringSiteReferenceData",module,"/emar:",vars[1],sep="")), xmlValue))==0){
cat(urls$Agency[h],"has no records for <emar:SWQuality>\n")
} else {
# We declared vars earlier. Next section of code goes and gets these values from the WFS
# in sequence
#vars <- c("SiteID","CouncilSiteID","LawaSiteID","SWQuality","SWQAltitude","SWQLanduse",
# "SWQFrequencyAll","SWQFrequencyLast5","Region","Agency")
for(i in 1:length(vars)){
if(i==1){
# for the first variable in vars
a<- sapply(getNodeSet(doc=xmldata,
path=paste("//emar:LawaSiteID/../../",ns,"MonitoringSiteReferenceData",module,"/emar:",vars[i],sep="")), xmlValue)
cat(vars[i],":\t",length(a),"\n")
#Cleaning var[i] values to remove any leading and trailing spaces
a <- trimws(a)
nn <- length(a)
} else {
# for all subsequent variables
b<- sapply(getNodeSet(doc=xmldata,
path=paste("//emar:LawaSiteID/../../",ns,"MonitoringSiteReferenceData",module,"/emar:",vars[i],sep="")), xmlValue)
cat(vars[i],":\t",length(b),"\n")
if(length(b)==0){
if(vars[i]=="Region"){
b[1:nn] <-stopGapNames[stopGapNames$Agency==urls$Agency[h],2]
} else if(vars[i]=="Agency"){
b[1:nn]<-stopGapNames[stopGapNames$Agency==urls$Agency[h],1]
} else {
b[1:nn]<-""
}
}
#Cleaning b to remove any leading and trailing spaces
b <- trimws(b)
a <- cbind(unlist(a),unlist(b))
}
}
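# At this point a is an n x 10 character matrix: one column per entry in vars,
# one row per monitoring site carrying a LawaSiteID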
a <- as.data.frame(a,stringsAsFactors=FALSE)
### grab the latitude and longitude values (WFS version must be 1.1.0),
### anchoring on emar:LawaSiteID so only sites with a LAWA ID contribute a <gml:Point>
latlong <- sapply(getNodeSet(doc=xmldata,
path=paste("//gml:Point[../../emar:LawaSiteID/../../",ns,"MonitoringSiteReferenceData",module,"]",sep="")), xmlValue)
llSiteName <- sapply(getNodeSet(doc=xmldata,
path=paste("//gml:Point[../../emar:LawaSiteID/../../emar:MonitoringSiteReferenceData",module,"]",
"/../../../",ns,"MonitoringSiteReferenceData/emar:CouncilSiteID",sep="")), xmlValue)
latlong <- simplify2array(strsplit(latlong," "))
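# latlong is now a 2 x n character matrix - row 1 latitudes, row 2 longitudes - since each
# <gml:Point> value is a single "lat long" string, e.g. "-40.35 175.61" splits to c("-40.35","175.61")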
rm(b,xmldata)
if(nrow(a)==length(latlong[1,])){
a <- cbind.data.frame(a,as.numeric(latlong[1,]),as.numeric(latlong[2,]))
} else {
b <- as.data.frame(matrix(latlong,ncol=2,nrow=length(latlong[1,]),byrow=TRUE))
b <- cbind.data.frame(b,llSiteName,stringsAsFactors=FALSE)
names(b) <- c("Lat","Long","CouncilSiteID")
#Cleaning CouncilSiteID to remove any leading and trailing spaces
b$CouncilSiteID <- trimws(b$CouncilSiteID)
#b$SiteID <- trimws(b$SiteID)
cat("Only",length(latlong[1,]),"out of",nrow(a),"sites with lat-longs.\nSome site locations missing\n")
#if(h==11){ # Change back to 11 once BOPRC included again
if(h==12){ # Northland - might be the case for all other councils too. Verify
a <- merge(a,b,by.x="V2",by.y="CouncilSiteID",all.x=TRUE)
} else {
a <- merge(a,b,by.x="V1",by.y="CouncilSiteID",all.x=TRUE)
}
}
rm(latlong)
#a<-as.data.frame(a,stringsAsFactors=FALSE)
names(a)<-c(vars,"Lat","Long")
if(!exists("siteTable")){
siteTable<-as.data.frame(a,stringsAsFactors=FALSE)
} else{
siteTable<-rbind.data.frame(siteTable,a,stringsAsFactors=FALSE)
}
rm(a)
}
cat("\n---------------------------\n\n",sep="")
}
}
### LOG FINISH: output to ROutput folder
sink()
###
# For some reason, the lat/longs for Wairua at Purua were loaded as a two-item vector -
# the first item NA, the second the actual value. The next two lines work around the problem;
# if the underlying issue is ever resolved, these two lines should start to error.
#siteTable$Lat[siteTable$LawaSiteID=="NRWQN-00022"] <- siteTable$Lat[siteTable$LawaSiteID=="NRWQN-00022"][2]
#siteTable$Long[siteTable$LawaSiteID=="NRWQN-00022"] <-siteTable$Long[siteTable$LawaSiteID=="NRWQN-00022"][2]
#dropping Northland for the time being 7-Sep-2016
#siteTable <- siteTable[siteTable$Region!="Northland",]
#Converting values in the frequency columns to Title case
#From http://www.johnmyleswhite.com/notebook/2009/02/25/text-processing-in-r/
pseudo.titlecase = function(str){
# uppercase the first character only; the rest of the string is left untouched
substr(str, 1, 1) = toupper(substr(str, 1, 1))
return(str)
}
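# e.g. pseudo.titlecase("monthly") returns "Monthly"; "Monthly" is returned unchanged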
siteTable$SWQFrequencyAll <- pseudo.titlecase(siteTable$SWQFrequencyAll )
siteTable$SWQFrequencyLast5 <- pseudo.titlecase(siteTable$SWQFrequencyLast5 )
#
# #Editing the BOPRC values as landuse and altitude values are missing
# boprc <- siteTable[siteTable$Agency=="BOPRC",]
# siteAltLand <- read.csv("//file/herman/r/oa/08/02/2017/Water Quality/1.AsSupplied/BOP/siteAltLand.csv")
# boprc <- merge(boprc,siteAltLand,by.x="LawaSiteID",by.y="LAWAID",all.x=TRUE)
# boprc$SWQLanduse <- boprc$LanduseGroup
# boprc$SWQAltitude <- boprc$AltitudeGroup
#
# boprc <- boprc[,c(1:12)]
#
# n <- names(boprc)
# n[9] <- "Region"
# names(boprc) <- n
#
# siteTable <- siteTable[siteTable$Region!="Bay of Plenty",]
# siteTable <- rbind(siteTable,boprc)
#
# rm(boprc,n,siteAltLand)
#Load NIWA Site data from csv
niwaSites <- read.csv("\\\\file\\herman\\R\\OA\\08\\02\\2018\\Water Quality\\0.AsSupplied\\NIWA\\SiteList.csv",stringsAsFactors=FALSE)
niwaSites <- niwaSites[,c(1:53)]
#Editing the NIWA values as landuse and altitude values are missing
#NIWA <- siteTable[siteTable$Agency=="NIWA",]
siteAltLand <- read.csv("//file/herman/r/oa/08/02/2018/Water Quality/0.AsSupplied/NIWA/siteAltLand.txt")
NIWA <- merge(niwaSites,siteAltLand,by.x="LawaSiteID",by.y="LAWAID",all.x=TRUE)
# selecting fields in NIWA dataframe that match those in the siteTable dataframe
NIWA <- NIWA[,c(2,6,1,13,55,56,15,16,53,52,4,3)]
names(NIWA) <- names(siteTable)
#siteTable <- siteTable[siteTable$Agency!="NIWA",]
siteTable <- rbind(siteTable,NIWA)
rm(NIWA,niwaSites,siteAltLand)
## Changing BOP Site names that use extended characters
## Waiōtahe at Toone Rd LAWA-100395 Waiotahe at Toone Rd
## Waitahanui at Ōtamarākau Marae EBOP-00038 Waitahanui at Otamarakau Marae
siteTable$SiteID[siteTable$LawaSiteID=="LAWA-100395"] <- "Waiotahe at Toone Rd"
siteTable$SiteID[siteTable$LawaSiteID=="EBOP-00038"] <- "Waitahanui at Otamarakau Marae"
## A better solution would be to deal directly with the characters and bulk convert to plain ascii text, rather than simply
## discovering sites with issues and renaming them manually
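## A minimal sketch of that bulk conversion, assuming the strings are UTF-8. iconv's
## "ASCII//TRANSLIT" transliteration is platform-dependent, so check the output before
## replacing the manual renames above:
# siteTable$SiteID <- iconv(siteTable$SiteID, from = "UTF-8", to = "ASCII//TRANSLIT")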
#siteTable <- read.csv(file = "LAWA_Site_Table.csv",stringsAsFactors=FALSE)
#siteTable <- siteTable[,c(2:13)]
#
# #Editing Southland data - error in LawaSiteID for one record
# # ES-00165\nES-00165
# sum(grepl("^ES-00165",x = siteTable$LawaSiteID))
# siteTable$LawaSiteID[grepl("^ES-00165",x = siteTable$LawaSiteID)] <- "ES-00165"
## Swapping coordinate values for Environment Canterbury, Christchurch and TRC,
## where Lat and Long appear transposed in the feed
agencies <- c("Environment Canterbury","Christchurch","TRC")
for(a in 1:length(agencies)){
lon <- siteTable$Lat[siteTable$Agency==agencies[a]]
siteTable$Lat[siteTable$Agency==agencies[a]] <- siteTable$Long[siteTable$Agency==agencies[a]]
siteTable$Long[siteTable$Agency==agencies[a]] <- lon
}
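## A more general check (sketch, not run): New Zealand latitudes are negative and longitudes
## sit roughly between 166 and 179 east, so transposed rows could be detected rather than
## hard-coded by agency:
# swapped <- !is.na(siteTable$Lat) & siteTable$Lat > 0
# siteTable[swapped, c("Lat","Long")] <- siteTable[swapped, c("Long","Lat")]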
#siteTable$Long[siteTable$LawaSiteID=="NRWQN-00022"] <-siteTable$Long[siteTable$LawaSiteID=="NRWQN-00022"][2]
# For WCRC-00031 - location is wrong in WFS
# NZTM coordinates from WCRC website: 1466541,5295450
# WGS84, now: Latitude Longitude -42.48179737 171.37623113
siteTable$Lat[siteTable$LawaSiteID=="WCRC-00031"] <- -42.48179737
siteTable$Long[siteTable$LawaSiteID=="WCRC-00031"] <- 171.37623113
## Correcting variations in Region names
siteTable$Region[siteTable$Region=="BayOfPlenty"] <- "Bay of Plenty"
siteTable$Region[siteTable$Region=="WaikatoRegion"] <- "Waikato"
siteTable$Region[siteTable$Region=="HawkesBay"] <- "Hawkes Bay"
siteTable$Region[siteTable$Region=="WestCoast"] <- "West Coast"
## Output for next script
write.csv(x = siteTable,file = "LAWA_Site_Table.csv")
#write.csv(x = siteTable,file = "LAWA_Site_Table1.csv")
write.csv(x = siteTable,file = "LAWA_Site_Table_WFS_PULL.csv")