forked from archlinux/archmanweb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_txt.pgc
223 lines (182 loc) · 5.95 KB
/
convert_txt.pgc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// vim: ft=c
#include <errno.h> // errno, EINTR
#include <signal.h> // signal, SIGPIPE
#include <stdbool.h> // bool
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // memcpy
#include <fcntl.h> // open
#include <getopt.h>
#include <sys/wait.h> // waitpid
#include <unistd.h> // fork, dup2, execlp
// reference: https://www.postgresql.org/docs/current/static/ecpg.html
// failsafe against SQL errors: these directives apply to every EXEC SQL
// statement that follows them in this file. Warnings are printed (SQLPRINT)
// and execution continues; any SQL error terminates the program (STOP).
EXEC SQL WHENEVER SQLWARNING SQLPRINT;
EXEC SQL WHENEVER SQLERROR STOP;
// Connection parameters. They are filled in from the command line by main()
// and used as host variables by the CONNECT statement in convert().
EXEC SQL BEGIN DECLARE SECTION;
char* db_target = NULL;
char* db_user = NULL;
char* db_password = NULL;
EXEC SQL END DECLARE SECTION;
// Abort the program after a fatal (non-SQL) error.
// Issues a ROLLBACK, closes all database connections and then exits with
// the given status. Does not return.
void die(int status)
{
// discard everything done in the current transaction
EXEC SQL ROLLBACK;
EXEC SQL DISCONNECT ALL;
exit(status);
}
// Tuning constants for run_mandoc().
enum {
    MANDOC_READ_BUFSIZE = 4096,     // bytes requested per read() call
    MANDOC_ALLOC_CHUNKSIZE = 65536  // growth step of the output buffer
};

// Grow *txt in MANDOC_ALLOC_CHUNKSIZE steps until it is allocated and can
// hold at least `needed` bytes. Terminates the program via die() when the
// allocation fails.
static void reserve_txt(char** txt, size_t* txt_allocated, size_t needed)
{
    while (*txt == NULL || *txt_allocated < needed) {
        size_t new_size = *txt_allocated + MANDOC_ALLOC_CHUNKSIZE;
        char* new_content = realloc(*txt, new_size);
        if (new_content == NULL) {
            free(*txt);
            *txt = NULL;
            perror("realloc");
            die(1);
        }
        *txt = new_content;
        *txt_allocated = new_size;
    }
}

// Write all `length` bytes of `data` to `fd`, continuing after short writes
// and retrying when interrupted by a signal. Returns false (with errno set
// by write) if the data could not be written completely.
static bool write_fully(int fd, const char* data, size_t length)
{
    while (length > 0) {
        ssize_t n_written = write(fd, data, length);
        if (n_written < 0) {
            if (errno == EINTR)
                continue;
            return false;
        }
        data += n_written;
        length -= (size_t) n_written;
    }
    return true;
}

// Render a manual page to UTF-8 text by piping it through "mandoc -Tutf8".
//
// Parameters:
//   raw_length    - number of bytes of `raw` to feed to mandoc
//   raw           - manual page source
//   txt_length    - out: number of bytes stored in *txt (without the \0)
//   txt           - in/out: heap buffer receiving the \0-terminated output;
//                   may be NULL on entry, is grown with realloc as needed and
//                   stays owned by the caller, who can reuse it across calls
//   txt_allocated - in/out: current allocated size of *txt in bytes
//
// Returns true only if the whole input was delivered, the output was read
// without error and mandoc exited with status 0. Unrecoverable system errors
// (pipe, fork, realloc, waitpid) terminate the program through die().
bool run_mandoc(int raw_length, char* raw, int* txt_length, char** txt, size_t* txt_allocated)
{
    *txt_length = 0;
    // Allocate up front so that the buffer exists (and can be terminated)
    // even when mandoc does not print a single byte.
    reserve_txt(txt, txt_allocated, 1);

    // pipes for the communication
    int pipeIn[2];
    int pipeOut[2];
    if (pipe(pipeIn) == -1 || pipe(pipeOut) == -1) {
        perror("pipe");
        die(1);
    }

    pid_t cpid = fork();
    if (cpid == -1) {
        perror("fork");
        die(1);
    }

    if (cpid == 0) // if I am the child then
    {
        // The diagnostics of mandoc are not used by the caller. Send them to
        // /dev/null instead of a pipe nobody reads: such a pipe could fill up
        // and block the child forever.
        int devnull = open("/dev/null", O_WRONLY);
        if (devnull == -1
                || dup2(pipeIn[0], STDIN_FILENO) == -1
                || dup2(pipeOut[1], STDOUT_FILENO) == -1
                || dup2(devnull, STDERR_FILENO) == -1)
            _exit(127);
        // the duplicated originals are not needed any more
        close(devnull);
        close(pipeIn[0]);
        close(pipeIn[1]);
        close(pipeOut[0]);
        close(pipeOut[1]);
        execlp("mandoc", "mandoc", "-Tutf8", (char*) NULL);
        // Only reached when exec failed. Leave immediately (without running
        // exit handlers or flushing stdio buffers copied from the parent) so
        // that the child never returns into the caller's code.
        _exit(127);
    }

    // if I am the parent then
    // close the unused sides of the pipes
    close(pipeIn[0]);
    close(pipeOut[1]);

    // Send the content to the child's stdin. SIGPIPE is ignored meanwhile so
    // that a child which exited early yields an EPIPE error instead of
    // killing this process. This is set up after fork() on purpose, the
    // child must keep the original disposition.
    // NOTE(review): the whole input is written before any output is read,
    // which relies on mandoc consuming its input before it starts printing;
    // otherwise large pages could deadlock on full pipes - confirm.
    void (*old_sigpipe)(int) = signal(SIGPIPE, SIG_IGN);
    bool ok = write_fully(pipeIn[1], raw, raw_length > 0 ? (size_t) raw_length : 0);
    if (!ok)
        perror("write");
    if (old_sigpipe != SIG_ERR)
        signal(SIGPIPE, old_sigpipe);
    // close the write-end of the pipe, thus sending EOF to the reader
    close(pipeIn[1]);

    // read the mandoc's output
    char buffer[MANDOC_READ_BUFSIZE];
    for (;;) {
        ssize_t n_read = read(pipeOut[0], buffer, sizeof buffer);
        if (n_read == 0)
            break;
        if (n_read < 0) {
            if (errno == EINTR)
                continue;
            perror("read");
            ok = false;
            break;
        }
        // + 1 reserves space for the terminating \0
        reserve_txt(txt, txt_allocated, (size_t) *txt_length + (size_t) n_read + 1);
        memcpy(*txt + *txt_length, buffer, (size_t) n_read);
        *txt_length += (int) n_read;
    }
    (*txt)[*txt_length] = '\0';

    // Close the read-end before waiting: if reading was aborted, this makes
    // sure the child cannot stay blocked on a full pipe.
    close(pipeOut[0]);

    // wait for the child to exit
    // (just in case, it should exit right after we read the EOF)
    int status = 0;
    pid_t waited;
    do {
        waited = waitpid(cpid, &status, 0);
    } while (waited == -1 && errno == EINTR);
    if (waited == -1) {
        perror("waitpid");
        die(1);
    }

    return ok && WIFEXITED(status) && WEXITSTATUS(status) == 0;
}
void convert()
{
EXEC SQL CONNECT TO :db_target AS main_connection USER :db_user USING :db_password;
// if there are multiple connections: EXEC SQL SET CONNECTION connection-name;
// or EXEC SQL AT connection-name SELECT ...;
// NOTE: .so elimination is not necessary for full-text search, it would
// only introduce duplicate results. Therefore we skip all manuals
// containing the .so macro right in the SQL query.
EXEC SQL DECLARE content CURSOR FOR
SELECT id, octet_length(raw) FROM archweb_manpages_content
WHERE txt IS NULL AND raw !~ '(?n)^\.so .*$'
ORDER BY id;
EXEC SQL OPEN content;
EXEC SQL PREPARE select_raw FROM "SELECT raw FROM archweb_manpages_content WHERE id = ?";
EXEC SQL PREPARE update_txt FROM "UPDATE archweb_manpages_content SET txt = regexp_replace(?, '.\b', '', 'g') WHERE id = ?";
EXEC SQL BEGIN DECLARE SECTION;
int id;
int raw_length;
char* raw = NULL;
char* txt = NULL;
EXEC SQL END DECLARE SECTION;
size_t allocated_raw = 0;
size_t allocated_txt = 0;
// when end of result set reached, break out of while loop
EXEC SQL WHENEVER NOT FOUND DO BREAK;
while (1) {
EXEC SQL FETCH content INTO :id, :raw_length;
printf("%d, %d bytes \n", id, raw_length);
if (allocated_raw < raw_length + 1) {
char* new_content = realloc(raw, raw_length + 1);
if (new_content == NULL) {
free(raw);
perror("realloc");
die(1);
}
raw = new_content;
allocated_raw = raw_length + 1;
}
EXEC SQL EXECUTE select_raw INTO :raw USING :id;
int txt_length = 0;
bool success = run_mandoc(raw_length, raw, &txt_length, &txt, &allocated_txt);
if (success)
EXEC SQL EXECUTE update_txt USING :txt, :id;
}
free(raw);
free(txt);
EXEC SQL CLOSE content;
EXEC SQL DEALLOCATE PREPARE select_raw;
EXEC SQL DEALLOCATE PREPARE update_txt;
EXEC SQL COMMIT;
EXEC SQL DISCONNECT ALL;
}
// Entry point: reads --target, --user and --password (all mandatory) into
// the db_* globals and runs the conversion. Exits with status 1 on an
// unknown option or when one of the three options is missing.
int main(int argc, char* argv[])
{
    static const struct option long_options[] = {
        {"target",   required_argument, 0, 't'},
        {"user",     required_argument, 0, 'u'},
        {"password", required_argument, 0, 'p'},
        {0, 0, 0, 0}
    };

    // only long options are accepted, hence the empty short-option string
    int opt;
    while ((opt = getopt_long(argc, argv, "", long_options, NULL)) != -1) {
        if (opt == 't') {
            db_target = optarg;
        } else if (opt == 'u') {
            db_user = optarg;
        } else if (opt == 'p') {
            db_password = optarg;
        } else {
            // unknown option or missing argument
            exit(1);
        }
    }

    if (!db_target || !db_user || !db_password) {
        fprintf(stderr, "usage: %s --target <dbname@host> --user <username> --password <password>\n", argv[0]);
        exit(1);
    }

    convert();
    return 0;
}