-
Notifications
You must be signed in to change notification settings - Fork 4
/
cwm_string.py
executable file
·381 lines (297 loc) · 12.7 KB
/
cwm_string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#! /usr/bin/python
"""
$Id: cwm_string.py,v 1.41 2013-11-02 16:23:24 timbl Exp $
String built-ins for cwm
This started as http://www.w3.org/2000/10/swap/string.py
See cwm.py
"""
import string
import re
from diag import verbosity, progress
import urllib # for hasContent
from term import LightBuiltIn, ReverseFunction, Function, UnknownType
from local_decimal import Decimal
# NOTE(review): not referenced by any code visible in this module; presumably
# the data: URI prefix used for N3 literals elsewhere -- confirm with callers.
LITERAL_URI_prefix = "data:text/n3;"
# Namespace of the string built-ins.  register() drops the trailing "#" to
# obtain the namespace symbol and then interns each built-in as a fragment.
STRING_NS_URI = "http://www.w3.org/2000/10/swap/string#"
###############################################################################################
#
# S T R I N G B U I L T - I N s
#
# This should be in a separate module, imported and called once by the user
# to register the code with the store
#
# Light Built-in classes
class BI_GreaterThan(LightBuiltIn):
    """string:greaterThan -- true when the subject's string compares
    strictly greater than the object's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        # Ordinary Python ordering on the two .string values.
        left, right = subj.string, obj.string
        return left > right
class BI_NotGreaterThan(LightBuiltIn):
    """string:notGreaterThan -- true when the subject's string compares
    less than or equal to the object's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        left, right = subj.string, obj.string
        return left <= right
class BI_LessThan(LightBuiltIn):
    """string:lessThan -- true when the subject's string compares
    strictly less than the object's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        left, right = subj.string, obj.string
        return left < right
class BI_NotLessThan(LightBuiltIn):
    """string:notLessThan -- true when the subject's string compares
    greater than or equal to the object's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        left, right = subj.string, obj.string
        return left >= right
class BI_StartsWith(LightBuiltIn):
    """string:startsWith -- true when the subject's string begins with
    the object's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        whole = subj.string
        prefix = obj.string
        return whole.startswith(prefix)
class BI_EndsWith(LightBuiltIn):
    """string:endsWith -- true when the subject's string ends with
    the object's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        whole = subj.string
        suffix = obj.string
        return whole.endswith(suffix)
# Added, SBP 2001-11:-
class BI_Contains(LightBuiltIn):
    """string:contains -- true when the object's string occurs somewhere
    inside the subject's string (case-sensitive)."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = subj.string
        needle = obj.string
        # str.find() yields -1 exactly when there is no occurrence.
        return haystack.find(needle) != -1
class BI_ContainsIgnoringCase(LightBuiltIn):
    """string:containsIgnoringCase -- like string:contains, but both
    strings are lower-cased before the substring test."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = subj.string.lower()
        needle = obj.string.lower()
        return needle in haystack
class BI_ContainsRoughly(LightBuiltIn):
    """string:containsRoughly -- substring test after lower-casing both
    strings and collapsing their whitespace runs with normalizeWhitespace()."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = normalizeWhitespace(subj.string.lower())
        needle = normalizeWhitespace(obj.string.lower())
        return needle in haystack
class BI_DoesNotContainRoughly(LightBuiltIn):
    """string:notContainsRoughly -- negation of string:containsRoughly:
    true when, after lower-casing and whitespace normalisation, the object's
    string does not occur in the subject's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = normalizeWhitespace(subj.string.lower())
        needle = normalizeWhitespace(obj.string.lower())
        return needle not in haystack
class BI_DoesNotContain(LightBuiltIn):
    """string:doesNotContain -- negation of string:contains: true when the
    object's string does not occur in the subject's string (case-sensitive)."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = subj.string
        needle = obj.string
        # str.find() yields -1 exactly when there is no occurrence.
        return haystack.find(needle) == -1
class BI_equalIgnoringCase(LightBuiltIn):
    """string:equalIgnoringCase -- true when the two strings are equal
    once both have been lower-cased."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        left = subj.string.lower()
        right = obj.string.lower()
        return left == right
class BI_notEqualIgnoringCase(LightBuiltIn):
    """string:notEqualIgnoringCase -- true when the two strings differ
    even after both have been lower-cased."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        # Use the str.lower() method, as BI_equalIgnoringCase does, rather
        # than the deprecated string.lower() module function (which merely
        # delegates to the method and no longer exists in Python 3).
        return subj.string.lower() != obj.string.lower()
def normalizeWhitespace(s):
    """Collapse every run of space, tab, CR or LF in *s* to one space.

    Leading and trailing runs are collapsed to a single space as well; they
    are not stripped.  All other characters are copied through unchanged.
    """
    pieces = []
    in_gap = False  # True while we are inside a run of whitespace
    for ch in s:
        if ch in " \t\r\n":
            if not in_gap:
                pieces.append(" ")
            in_gap = True
        else:
            pieces.append(ch)
            in_gap = False
    # Join once at the end instead of growing a string character by character.
    return "".join(pieces)
# String Constructors - more light built-ins
# Converter used by BI_concatenation to turn long and Decimal operands
# into text.
make_string = unicode
class BI_concat(LightBuiltIn, ReverseFunction):
    """string:concat -- the subject is the concatenation of the strings in
    the object list."""

    def evaluateSubject(self, obj_py):
        """Return the items of *obj_py* joined into one string.

        Returns None (no result) as soon as any item is not a string
        according to isString(); nothing is coerced here.
        """
        if verbosity() > 80:
            progress("Concat input:" + repr(obj_py))
        parts = []
        for item in obj_py:
            if not isString(item):
                return None  # Can't
            parts.append(item)
        # One join instead of repeated "+"; also avoids shadowing builtin str.
        return "".join(parts)
def _coerce_for_concat(value):
    """Return *value* as text suitable for string:concatenation.

    Strings pass through untouched.  long and Decimal values go through
    make_string(); anything else falls back to repr().
    """
    if isString(value):
        return value
    if isinstance(value, (long, Decimal)):
        value = make_string(value)
    else:
        value = repr(value)
    # NOTE(review): the flattened source does not show whether this warning
    # was meant for every coercion or only for the repr() fallback; it is
    # emitted for every coercion here -- confirm against the upstream file.
    if verbosity() > 34:
        progress("Warning: Coercing to string for concat:" + repr(value))
    return value


class BI_concatenation(LightBuiltIn, Function):
    """string:concatenation -- the object is the concatenation of the items
    of the subject list, with non-string items coerced to text."""

    def evaluateObject(self, subj_py):
        """Return the items of *subj_py* joined into a single string."""
        if verbosity() > 80:
            progress("Concatenation input:" + repr(subj_py))
        return "".join([_coerce_for_concat(item) for item in subj_py])

    def evalObj45(self, subj, queue, bindings, proof, query):
        """Term-level variant: build a new literal from the values of the
        terms in *subj*.

        A term whose value() raises UnknownType contributes its .string
        attribute instead, with an unconditional warning.
        """
        subj_py = list(subj)
        if verbosity() > 80:
            progress("Concatenation input:" + repr(subj_py))
        retVal = []
        for x in subj_py:
            try:
                retVal.append(_coerce_for_concat(x.value()))
            except UnknownType:
                progress("Warning: Coercing to string for concat:" + repr(x))
                retVal.append(x.string)
        return subj.store.newLiteral(''.join(retVal))
class BI_scrape(LightBuiltIn, Function):
    """A built-in for scraping using regexps.

    Takes a list of 2 strings; the first is the input data, and the second
    is a regex with one () group.  Returns the data matched by the () group.

    see also: test/includes/scrape1.n3
    Hmm... negative tests don't seem to work.
    """

    def evaluateObject(self, subj_py):
        """Return group 1 of the first match of the pattern in the data,
        or None when the pattern does not match."""
        if verbosity() > 80:
            progress("scrape input:" + repr(subj_py))
        text, pattern = subj_py
        match = re.compile(pattern).search(text)
        if not match:
            if verbosity() > 80:
                progress("scrape didn't match")
            return None
        captured = match.group(1)
        if verbosity() > 80:
            # %-formatting rather than "+": group(1) is None when the group
            # did not take part in the match, and must not crash the log line.
            progress("scrape matched:%s" % (captured,))
        return captured
class BI_replace(LightBuiltIn, Function):
    """A built-in for replacing characters or substrings.

    Takes a list of 3 strings; the first is the input data, the second the
    old and the third the new string.  The object is calculated as the
    replaced string.
    For example, ("fofof bar", "of", "baz") string:replace "fbazbaz bar".
    """

    def evaluateObject(self, subj_py):
        """Return the data with every occurrence of old replaced by new."""
        text, old, new = subj_py
        replaced = text.replace(old, new)
        return replaced
class BI_search(LightBuiltIn, Function):
    """A more powerful built-in for scraping using regexps.

    Takes a list of 2 strings; the first is the input data, and the second
    is a regex with one or more () groups.  Returns the list of data matched
    by the () groups.

    see also: test/includes/search.n3
    """

    def evaluateObject(self, subj_py):
        """Return the tuple of groups of the first match of the pattern in
        the data, or None when the pattern does not match."""
        if verbosity() > 80:
            progress("search input:" + repr(subj_py))
        text, pattern = subj_py
        match = re.compile(pattern).search(text)
        if not match:
            if verbosity() > 80:
                progress("search didn't match")
            return None
        groups = match.groups()
        if verbosity() > 80:
            # Log the first group defensively: it may be absent (pattern
            # without groups) or None (group did not take part in the match).
            first = groups[0] if groups else None
            progress("search matched:%s" % (first,))
        return groups
class BI_split(LightBuiltIn, Function):
    """Split a string into a list of strings.

    Takes a list of 2 strings and an integer; the first is the input data,
    the second is a regex, and the integer is passed to the compiled
    pattern's split() as its maxsplit argument.
    see re.split in http://docs.python.org/lib/node46.html
    """

    def evaluateObject(self, subj_py):
        """Return the pieces of the data split on the regex."""
        text, pattern, max_splits = subj_py
        return re.compile(pattern).split(text, max_splits)
class BI_tokenize(LightBuiltIn, Function):
    """Like split without the max arg.

    Takes a list of 2 strings: the input data and a regex to split on.
    """

    def evaluateObject(self, subj_py):
        """Return the pieces of the data split on every match of the regex."""
        text, pattern = subj_py
        return re.compile(pattern).split(text)
class BI_normalize_space(LightBuiltIn, Function):
    """Returns the value of $arg with whitespace normalized by
    stripping leading and trailing whitespace and replacing sequences
    of one or more than one whitespace character with a single space,
    #x20 -- http://www.w3.org/2006/xpath-functions#normalize-space
    """

    def evaluateObject(self, subj_py):
        """Return *subj_py* with its whitespace-separated words re-joined
        by single spaces."""
        # split() with no argument drops leading/trailing whitespace and
        # treats any whitespace run as one separator.
        return ' '.join(subj_py.split())
class BI_stringToList(LightBuiltIn, Function, ReverseFunction):
    """You need nothing else.  Makes a string a list of characters, and
    vice versa.
    """

    def evaluateObject(self, subj_py):
        """Return the characters of *subj_py* as a list, or None when it
        cannot be iterated."""
        # Diagnostics go through progress() like the other built-ins; the
        # old unconditional print polluted stdout on every call.
        if verbosity() > 80:
            progress("stringToList input:" + repr(subj_py))
        try:
            return list(subj_py)
        except TypeError:
            return None

    def evaluateSubject(self, obj_py):
        """Return the items of *obj_py* joined into one string, or None
        when they cannot be joined."""
        try:
            return "".join(obj_py)
        except TypeError:
            return None
class BI_format(LightBuiltIn, Function):
    """A built-in for string formatting, ala python % or C's sprintf or
    common-lisp's format.

    Takes a list; the first item is the format string, and the rest are args.

    see also: test/@@
    """

    def evaluateObject(self, subj_py):
        """Apply Python %-formatting: first item % tuple(remaining items)."""
        template = subj_py[0]
        arguments = tuple(subj_py[1:])
        return template % arguments
class BI_matches(LightBuiltIn):
    """string:matches -- the object's string, used as a regex, is found
    somewhere in the subject's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        # Returns the match object itself (or None), not a strict boolean.
        pattern = re.compile(obj.string)
        found = pattern.search(subj.string)
        return found
class BI_notMatches(LightBuiltIn):
    """string:notMatches -- the object's string, used as a regex, is found
    nowhere in the subject's string."""

    def eval(self, subj, obj, queue, bindings, proof, query):
        pattern = re.compile(obj.string)
        found = pattern.search(subj.string)
        # search() yields None on failure and an always-truthy match object
        # on success.
        return found is None
# Character classes handed to xmlEscape(); each matching character is
# replaced by a numeric character reference.
# Used by BI_xmlEscapeData: CR, '<', '>' and '&'.
dataEsc = re.compile(r"[\r<>&]") # timbl removed \n as can be in data
# Used by BI_xmlEscapeAttribute: the above plus both quote characters and LF.
attrEsc = re.compile(r"[\r<>&'\"\n]")
class BI_xmlEscapeData(LightBuiltIn, Function):
    """Take a unicode string and return it escaped for use as XML character
    data.  Quotes are left alone here; use BI_xmlEscapeAttribute for
    attribute values."""

    def evaluateObject(self, subj_py):
        """Escape the characters matched by dataEsc in *subj_py*."""
        return xmlEscape(subj_py, markupChars=dataEsc)
class BI_xmlEscapeAttribute(LightBuiltIn, Function):
    """Take a unicode string and return it escaped for use inside an XML
    attribute value: as for character data, plus both quote characters
    and newline."""

    def evaluateObject(self, subj_py):
        """Escape the characters matched by attrEsc in *subj_py*."""
        return xmlEscape(subj_py, markupChars=attrEsc)
def xmlEscape(subj_py, markupChars):
    """Escape a string given a regex of the markup chars to be escaped.

    Each character of *subj_py* at which the compiled regex *markupChars*
    matches is replaced by its decimal numeric character reference
    (``&#NN;``); all other text is copied unchanged.

    from toXML.py
    """
    pieces = []
    pos = 0
    end = len(subj_py)
    while pos < end:
        m = markupChars.search(subj_py, pos)
        if not m:
            # No further markup characters: keep the remainder as-is.
            pieces.append(subj_py[pos:])
            break
        hit = m.start()
        pieces.append(subj_py[pos:hit])
        pieces.append("&#%d;" % (ord(subj_py[hit]),))
        pos = hit + 1
    # Join once at the end instead of repeatedly growing a string.
    return "".join(pieces)
class BI_encodeForURI(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in an
    URI path segment.  See
    http://www.w3.org/TR/2005/CR-xpath-functions-20051103/#func-encode-for-uri"""

    def evaluateObject(self, subj_py):
        """Percent-encode *subj_py*, leaving the characters #!~*'() (in
        addition to urllib.quote's always-safe set) unescaped."""
        if isinstance(subj_py, unicode):
            # Python 2's urllib.quote() raises KeyError on non-ASCII unicode
            # input; percent-encode the UTF-8 bytes instead.
            subj_py = subj_py.encode("utf-8")
        return urllib.quote(subj_py, "#!~*'()")
class BI_encodeForFragID(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in
    a URI fragment identifier."""

    def evaluateObject(self, subj_py):
        """Percent-encode *subj_py* with urllib.quote's default safe set."""
        if isinstance(subj_py, unicode):
            # Python 2's urllib.quote() raises KeyError on non-ASCII unicode
            # input; percent-encode the UTF-8 bytes instead.
            subj_py = subj_py.encode("utf-8")
        return urllib.quote(subj_py)
class BI_resolve_uri(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#resolve-uri"""

    def evaluateObject(self, subj_py):
        """Resolve a reference against a base URI.

        *subj_py* is a pair (reference, base); the result is
        uripath.join(base, reference).
        """
        # Imported at call time, as in the original.
        from uripath import join as join_uri
        reference, base_uri = subj_py
        return join_uri(base_uri, reference)
# Register the string built-ins with the store
def isString(x):
    """Return True when *x* is exactly a byte string or a unicode string.

    The check is on the exact type, so instances of subclasses do not count.
    """
    # in 2.2, evidently we can test for isinstance(types.StringTypes)
    string_types = (type(''), type(u''))
    return any(type(x) is kind for kind in string_types)
def register(store):
    """Intern every string built-in in *store*.

    Most built-ins become fragments of the string namespace (STRING_NS_URI
    without its trailing "#"); resolve-uri, tokenize and normalize-space are
    interned under http://www.w3.org/2006/xpath-functions instead.
    """
    string_builtins = (
        ("greaterThan", BI_GreaterThan),
        ("notGreaterThan", BI_NotGreaterThan),
        ("lessThan", BI_LessThan),
        ("notLessThan", BI_NotLessThan),
        ("startsWith", BI_StartsWith),
        ("endsWith", BI_EndsWith),
        ("concat", BI_concat),
        ("concatenation", BI_concatenation),
        ("scrape", BI_scrape),
        ("replace", BI_replace),
        ("search", BI_search),
        ("split", BI_split),
        ("stringToList", BI_stringToList),
        ("format", BI_format),
        ("matches", BI_matches),
        ("notMatches", BI_notMatches),
        ("contains", BI_Contains),
        ("containsIgnoringCase", BI_ContainsIgnoringCase),
        ("containsRoughly", BI_ContainsRoughly),
        ("notContainsRoughly", BI_DoesNotContainRoughly),
        ("doesNotContain", BI_DoesNotContain),
        ("equalIgnoringCase", BI_equalIgnoringCase),
        ("notEqualIgnoringCase", BI_notEqualIgnoringCase),
        ("xmlEscapeAttribute", BI_xmlEscapeAttribute),
        ("xmlEscapeData", BI_xmlEscapeData),
        ("encodeForURI", BI_encodeForURI),
        ("encodeForFragID", BI_encodeForFragID),
    )
    xpath_builtins = (
        ("resolve-uri", BI_resolve_uri),
        ("tokenize", BI_tokenize),
        ("normalize-space", BI_normalize_space),
    )

    # Same registration order as before: string namespace first, then fn.
    string_ns = store.symbol(STRING_NS_URI[:-1])
    for fragment, builtin in string_builtins:
        string_ns.internFrag(fragment, builtin)

    fn = store.symbol("http://www.w3.org/2006/xpath-functions")
    for fragment, builtin in xpath_builtins:
        fn.internFrag(fragment, builtin)