-- BigQuery views and queries.
-- https://bigquery.cloud.google.com/dataset/indie-map:indiemap
--
-- I should eventually switch this to just an export of the saved views and
-- queries there, instead of duplicating them here.

-- View: canonical_pages
-- (I originally used a function for SPLIT(url, '://')[SAFE_OFFSET(1)] etc, but
-- BigQuery doesn't allow functions in views.)
WITH extras AS (
  SELECT *, SPLIT(url, '://')[SAFE_OFFSET(1)] AS norm_url,
    (SELECT r.urls FROM UNNEST(rels) r WHERE r.value = 'canonical') AS canonicals
  FROM `indie-map.indiemap.pages`
)
SELECT *
FROM extras
-- canonicals is NULL (not an empty array) when a page has no rel-canonical, so
-- check for that explicitly.
WHERE ARRAY_LENGTH(u_urls) = 0
  OR norm_url IN (SELECT SPLIT(u_url, '://')[SAFE_OFFSET(1)] FROM UNNEST(u_urls) AS u_url)
  OR canonicals IS NULL
  OR ARRAY_LENGTH(canonicals) = 0
  OR norm_url IN (SELECT SPLIT(c_url, '://')[SAFE_OFFSET(1)] FROM UNNEST(canonicals) AS c_url)
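
-- The function form, usable in ad hoc queries (but not in views), would look
-- roughly like this (a sketch; NORM_URL is just an illustrative name):
CREATE TEMP FUNCTION NORM_URL(url STRING) AS (
  SPLIT(url, '://')[SAFE_OFFSET(1)]
);
SELECT NORM_URL(url) AS norm_url
FROM `indie-map.indiemap.pages`
LIMIT 10;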

-- View: links
WITH links AS (
  SELECT
    p.url AS from_url,
    p.domain AS from_domain,
    l.url AS to_url,
    REGEXP_EXTRACT(l.url,
      '^https?://(?:www\\.)?((?:twitter|facebook|plus.google)\\.com/[^/?]+)(?:/(?:posts|status(?:es)?)/?.*)?') AS to_site,
    CASE
      WHEN 'u-in-reply-to' IN UNNEST(l.classes) THEN 'u-in-reply-to'
      WHEN 'u-repost-of' IN UNNEST(l.classes) THEN 'u-repost-of'
      WHEN 'u-like-of' IN UNNEST(l.classes) THEN 'u-like-of'
      WHEN 'u-favorite-of' IN UNNEST(l.classes) THEN 'u-favorite-of'
      WHEN 'u-invitee' IN UNNEST(l.classes) THEN 'u-invitee'
      WHEN 'u-quotation-of' IN UNNEST(l.classes) THEN 'u-quotation-of'
      WHEN 'u-bookmark-of' IN UNNEST(l.classes) THEN 'u-bookmark-of'
      ELSE NULL
    END AS mf2_class
  FROM
    `indie-map.indiemap.canonical_pages` p,
    p.links l
  WHERE l.tag = 'a'
    AND 'nofollow' NOT IN UNNEST(l.rels)
    AND 'u-syndication' NOT IN UNNEST(l.classes)
)
SELECT from_url, from_domain, to_url,
  CASE WHEN to_site IS NOT NULL AND NOT ENDS_WITH(to_site, '.php')
    THEN to_site
    ELSE NET.HOST(to_url)
  END AS to_site,
  mf2_class
FROM links

-- View: links_social_graph
SELECT from_domain, to_site, mf2_class, COUNT(*) num
FROM `indie-map.indiemap.links`
WHERE to_site IS NOT NULL AND to_site != from_domain
GROUP BY from_domain, to_site, mf2_class
ORDER BY from_domain, to_site, mf2_class;

-- Per site info for JSON data files. Returns incomplete results since the
-- implicit UNNESTs in the FROM clause do an inner join on the rels and
-- mf2_classes columns, so they exclude pages without those values.
SELECT
  domain,
  MIN(fetch_time) AS crawl_start,
  MAX(fetch_time) AS crawl_end,
  COUNT(*) AS num_pages,
  -- Uncommenting this makes the query hit the 100MB row limit and fail. Haven't
  -- found the offending row yet.
  -- SUM(LENGTH(html)) AS total_html_size,
  ARRAY_AGG(DISTINCT mf2c IGNORE NULLS) AS mf2_classes,
  -- TODO: these only look at the first endpoint on each page
  ARRAY_AGG(DISTINCT (SELECT u FROM r.urls u WHERE r.value IN ('webmention', 'http://webmention.org/')
                      LIMIT 1) IGNORE NULLS) AS webmention_endpoints,
  ARRAY_AGG(DISTINCT (SELECT u FROM r.urls u WHERE r.value IN ('micropub', 'http://micropub.net/')
                      LIMIT 1) IGNORE NULLS) AS micropub_endpoints,
  -- Different places that tell us the server: HTTP Server header,
  -- meta generator, rel-generator. All are incomplete. :/
  ARRAY_AGG(DISTINCT (SELECT REGEXP_REPLACE(value, '/[0-9.]+', '') FROM p.headers WHERE name = 'Server')
            IGNORE NULLS) AS servers,
  ARRAY_AGG(DISTINCT (SELECT u FROM r.urls u WHERE r.value = 'generator'
                      LIMIT 1) IGNORE NULLS) AS rel_generators,
  ARRAY_AGG(DISTINCT REGEXP_REPLACE(
    REGEXP_EXTRACT(
      -- find meta generator tags
      REGEXP_EXTRACT(html, '<meta[^>]* name="generator"[^>]*>'),
      -- extract content value
      'content *= *[\'"]([^\'"]+)'),
    -- drop version numbers
    '[ :]*[0-9.]{2,}', '')
    IGNORE NULLS) AS meta_generators
FROM indiemap.pages p, p.rels r, p.mf2_classes mf2c
GROUP BY domain;
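
-- The comma UNNESTs above behave as inner joins. A LEFT JOIN UNNEST variant
-- (a sketch, not one of the saved queries) keeps pages with no mf2_classes at
-- all, aggregating a NULL for them instead of dropping the page:
SELECT
  domain,
  COUNT(DISTINCT p.url) AS num_pages,
  ARRAY_AGG(DISTINCT mf2c IGNORE NULLS) AS mf2_classes
FROM indiemap.pages p
LEFT JOIN UNNEST(p.mf2_classes) mf2c
GROUP BY domain;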

-- Per site info for JSON data files, separated by unnest columns. First the
-- singular columns:
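-- (A sketch along the lines of the combined query above; the exact saved
-- version may differ.)
SELECT
  domain,
  MIN(fetch_time) AS crawl_start,
  MAX(fetch_time) AS crawl_end,
  COUNT(*) AS num_pages
FROM indiemap.pages
GROUP BY domain;
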
-- ...then mf2_classes:
SELECT
  domain,
  ARRAY_AGG(DISTINCT m IGNORE NULLS) AS mf2_classes
FROM indiemap.pages p, p.mf2_classes m
GROUP BY domain;

-- ...then rels:
SELECT
  domain,
  -- TODO: these only look at the first endpoint on each page
  ARRAY_AGG(DISTINCT (SELECT u FROM r.urls u WHERE r.value IN ('webmention', 'http://webmention.org/')
                      LIMIT 1) IGNORE NULLS) AS webmention_endpoints,
  ARRAY_AGG(DISTINCT (SELECT u FROM r.urls u WHERE r.value IN ('micropub', 'http://micropub.net/')
                      LIMIT 1) IGNORE NULLS) AS micropub_endpoints,
  -- Different places that tell us the server: HTTP Server header, meta
  -- generator, rel-generator. All are incomplete. :/
  ARRAY_AGG(DISTINCT (SELECT REGEXP_REPLACE(value, '/[0-9.]+', '') FROM p.headers WHERE name = 'Server')
            IGNORE NULLS) AS servers,
  ARRAY_AGG(DISTINCT (SELECT u FROM r.urls u WHERE r.value = 'generator'
                      LIMIT 1) IGNORE NULLS) AS rel_generators
FROM indiemap.pages p, p.rels r
GROUP BY domain;

-- Rows with duplicate URLs. (Currently >750k!)
SELECT p.url, COUNT(*) c
FROM indiemap.pages p
GROUP BY p.url
HAVING c > 1
ORDER BY c DESC;

-- Attempts at deleting the dupe URLs. (None work yet.)
-- STATE: insert into a new table, keeping only ROW_NUMBER() = 1?
-- https://stackoverflow.com/a/3822833/186123
-- (This one fails because BigQuery can't DELETE from a WITH clause; the DELETE
-- target has to be an actual table.)
WITH partitions AS (
  SELECT ROW_NUMBER() OVER (PARTITION BY url ORDER BY fetch_time) row_num
  FROM indiemap.pages
  ORDER BY url
)
DELETE FROM partitions
WHERE row_num > 1;

-- Find domains that have exact-duplicate rows.
WITH dupes AS (
  SELECT domain, url, fetch_time, COUNT(*) c
  FROM indiemap.pages
  GROUP BY domain, url, fetch_time
  HAVING c > 1
  ORDER BY c DESC
)
SELECT DISTINCT domain FROM dupes;

-- Keep only the earliest fetch of each duplicated URL.
DELETE FROM indiemap.pages p
WHERE p.url IN (
    SELECT url
    FROM indiemap.pages
    GROUP BY url
    HAVING COUNT(*) > 1
  )
  AND fetch_time > (SELECT MIN(fetch_time) FROM indiemap.pages WHERE url = p.url);
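
-- A sketch of an approach that should work: rewrite the table, keeping only the
-- earliest fetch of each URL. (Assumes overwriting indiemap.pages in place is
-- acceptable; otherwise write to a new table and swap.)
CREATE OR REPLACE TABLE indiemap.pages AS
SELECT * EXCEPT (row_num)
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY url ORDER BY fetch_time) AS row_num
  FROM indiemap.pages
)
WHERE row_num = 1;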

-- Find rows with the biggest values in each column. Useful since BigQuery has
-- an undocumented 100MB limit on the amount of data processed per row in a
-- query. (These queries aren't currently saved in BigQuery.)
SELECT p.url, BYTE_LENGTH(html) len
FROM indiemap.pages p
ORDER BY len DESC;

SELECT p.url, BYTE_LENGTH(mf2) len
FROM indiemap.pages p
ORDER BY len DESC;

SELECT p.url, SUM(BYTE_LENGTH(u)) len
FROM indiemap.pages p, p.u_urls u
GROUP BY p.url
ORDER BY len DESC;

SELECT url, SUM(BYTE_LENGTH(r.value) + BYTE_LENGTH(ARRAY_TO_STRING(r.urls, ''))) len
FROM indiemap.pages p, p.rels r
GROUP BY p.url
ORDER BY len DESC;

SELECT url, SUM(BYTE_LENGTH(h.value) + BYTE_LENGTH(h.name)) len
FROM indiemap.pages p, p.headers h
GROUP BY p.url
ORDER BY len DESC;

SELECT p.url, SUM(
    BYTE_LENGTH(l.inner_html) +
    BYTE_LENGTH(l.url) +
    BYTE_LENGTH(ARRAY_TO_STRING(l.classes, '')) +
    BYTE_LENGTH(ARRAY_TO_STRING(l.rels, ''))
  ) len
FROM indiemap.pages p, p.links l
GROUP BY p.url
ORDER BY len DESC;

SELECT p.url, SUM(BYTE_LENGTH(m)) len
FROM indiemap.pages p, p.mf2_classes m
GROUP BY p.url
ORDER BY len DESC;
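
-- A rough per-row total across the columns above (a sketch, not one of the
-- saved queries), to help find the row that trips the 100MB limit:
SELECT p.url,
  BYTE_LENGTH(IFNULL(html, '')) +
  BYTE_LENGTH(IFNULL(mf2, '')) +
  (SELECT IFNULL(SUM(BYTE_LENGTH(u)), 0) FROM p.u_urls u) +
  (SELECT IFNULL(SUM(BYTE_LENGTH(r.value) + BYTE_LENGTH(ARRAY_TO_STRING(r.urls, ''))), 0)
   FROM p.rels r) +
  (SELECT IFNULL(SUM(BYTE_LENGTH(h.name) + BYTE_LENGTH(h.value)), 0) FROM p.headers h) +
  (SELECT IFNULL(SUM(BYTE_LENGTH(m)), 0) FROM p.mf2_classes m) +
  (SELECT IFNULL(SUM(BYTE_LENGTH(l.inner_html) + BYTE_LENGTH(l.url) +
                     BYTE_LENGTH(ARRAY_TO_STRING(l.classes, '')) +
                     BYTE_LENGTH(ARRAY_TO_STRING(l.rels, ''))), 0)
   FROM p.links l) AS approx_len
FROM indiemap.pages p
ORDER BY approx_len DESC;
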
-- Alternative: run this (in tcsh) over the per site json.gz files...but it
-- takes *forever*.
-- foreach f (*.json.gz)
-- echo `gzcat $f | wc -L` $f
-- end