-
Notifications
You must be signed in to change notification settings - Fork 3
/
isQaId.c
204 lines (158 loc) · 5.63 KB
/
isQaId.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*
isQaID: Check if a row from the StackOverflow data dump is in a list of answered questions or accepted answers
Peter Burkimsher
2014-02-10
To compile:
cc -lm isQaId.c -o isQaId
*/
#include <stdio.h> /* Standard input/output definitions */
#include <string.h> /* String function definitions */
#include <stdlib.h> /* Standard library */
#define XML_LINE 100000 /* Number of characters in one row */
#define ARRAYLENGTH 10292419 /* Number of question or answer IDs */
/*
textBetween: Get the text between two delimiters, excluding the delimiters
Properties
char* thisText: The text to search within
char* startText: The starting delimiter
char* endText: The ending delimiter (searched for after the starting delimiter)
char* returnText: A buffer to copy the NUL-terminated result to. It must be
                  large enough for the extracted text plus the terminator
                  and must not overlap thisText.
Return
char* startPointer: Pointer to the first character after startText inside
                    thisText, or NULL if any argument is NULL, if thisText
                    doesn't contain startText, or if endText doesn't occur
                    after startText. returnText is left untouched whenever
                    NULL is returned.
*/
char* textBetween(char* thisText, char* startText, char* endText, char* returnText)
{
    char* startPointer = NULL;
    char* endPointer = NULL;
    size_t stringLength = 0;

    if (thisText == NULL || startText == NULL || endText == NULL || returnText == NULL)
    {
        return NULL;
    }

    startPointer = strstr(thisText, startText);
    if (startPointer == NULL)
    {
        return NULL;
    }

    // Skip over the starting delimiter itself
    startPointer += strlen(startText);

    // strstr returns NULL when the end delimiter is missing; that result
    // must be checked before it is used to measure the span.
    endPointer = strstr(startPointer, endText);
    if (endPointer == NULL)
    {
        return NULL;
    }

    // Copy the characters between the start and end delimiters
    stringLength = (size_t)(endPointer - startPointer);
    memcpy(returnText, startPointer, stringLength);
    returnText[stringLength] = '\0';

    return startPointer;
}
/*
textBetweenInclusive: Get the text between two delimiters, including the delimiters
Properties
char* thisText: The text to search within
char* startText: The starting delimiter
char* endText: The ending delimiter (searched for after the starting delimiter)
char* returnText: A buffer to copy the NUL-terminated result to. It must be
                  large enough for both delimiters, the text between them
                  and the terminator, and must not overlap thisText.
Return
char* startPointer: Pointer to the start of startText inside thisText, or
                    NULL if any argument is NULL, if thisText doesn't
                    contain startText, or if endText doesn't occur after
                    startText. returnText is left untouched whenever NULL
                    is returned.
*/
char* textBetweenInclusive(char* thisText, char* startText, char* endText, char* returnText)
{
    char* startPointer = NULL;
    char* endPointer = NULL;
    size_t stringLength = 0;

    if (thisText == NULL || startText == NULL || endText == NULL || returnText == NULL)
    {
        return NULL;
    }

    startPointer = strstr(thisText, startText);
    if (startPointer == NULL)
    {
        return NULL;
    }

    // Look for the end delimiter only after the start delimiter, so that an
    // end delimiter which also occurs inside the start delimiter cannot cut
    // the result short. strstr returns NULL when the end delimiter is
    // missing, so the result must be checked before it is used.
    endPointer = strstr(startPointer + strlen(startText), endText);
    if (endPointer == NULL)
    {
        return NULL;
    }

    // Copy everything from the start delimiter through the end delimiter
    stringLength = (size_t)(endPointer - startPointer) + strlen(endText);
    memcpy(returnText, startPointer, stringLength);
    returnText[stringLength] = '\0';

    return startPointer;
}
/*
main: Checks rows to see if the ID of the row is in the qaIds.txt file
Reads the whitespace-separated integer IDs from qaIds.txt into memory, then
reads rows from stdin one line at a time, cuts out the ID of each row, and
checks it against the pre-read qaIds list. Every matching row is printed to
stdout in the compressed form
    <<postId<>acceptedAnswerId><body>>
where acceptedAnswerId is 0 and body is empty when the row lacks the
corresponding attribute.
The lookup walks forward through the list as the row IDs grow, so both
qaIds.txt and the rows on stdin are expected to be in ascending ID order.
Lines longer than XML_LINE - 1 characters are split by fgets and handled as
separate lines.
Return
int: EXIT_SUCCESS, or EXIT_FAILURE if qaIds.txt cannot be opened
*/
int main(int argc, char** argv)
{
    // Line buffer for reading the StackOverflow file, plus buffers for the
    // extracted post ID, row, body and accepted answer ID. They are static
    // so that these large arrays do not live on the stack.
    static char soBuffer[XML_LINE];
    static char postIdString[XML_LINE];
    static char rowString[XML_LINE];
    static char bodyString[XML_LINE];
    static char answerIdString[XML_LINE];
    // A static array, because an automatic array of this size would exceed
    // typical stack limits.
    static int qaIds[ARRAYLENGTH];
    // The number of IDs in the qaIds list
    int numberIds = 0;
    // The offset of the current ID in the qaIds list
    int currentId = 0;
    // The value of the current ID read from qaIds.txt
    int thisId = 0;
    // Integers for the post ID and answer ID
    int postId = 0;
    int answerId = 0;
    FILE *file = NULL;

    (void)argc;
    (void)argv;

    // Read the QA IDs file into the qaIds integer array
    file = fopen("qaIds.txt", "r");
    if (file == NULL)
    {
        perror("qaIds.txt");
        return EXIT_FAILURE;
    }
    // Stop at the capacity of the array rather than writing past its end
    while (numberIds < ARRAYLENGTH && fscanf(file, "%d", &thisId) == 1)
    {
        qaIds[numberIds] = thisId;
        numberIds++;
    }
    if (numberIds == ARRAYLENGTH && fscanf(file, "%d", &thisId) == 1)
    {
        fprintf(stderr, "Warning: qaIds.txt holds more than %d IDs; the rest are ignored\n", ARRAYLENGTH);
    }
    // Don't forget to close the file!
    fclose(file);

    // Read a line from stdin.
    while (fgets(soBuffer, XML_LINE, stdin) != NULL)
    {
        // Read the row ID; lines without one (XML header, enclosing tags)
        // carry no post and are skipped.
        if (textBetween(soBuffer, "row Id=\"", "\"", postIdString) == NULL)
        {
            continue;
        }
        postId = (int)strtol(postIdString, NULL, 10);

        // Move forward in the IDs array until the post ID is found, or not,
        // without ever stepping beyond the IDs that were actually loaded.
        while (currentId < numberIds && qaIds[currentId] < postId)
        {
            currentId++;
        }
        // Once the list is exhausted, wrap back to its start
        if (currentId >= numberIds)
        {
            currentId = 0;
        }
        // Nothing to print unless the post ID is in the list
        if (numberIds == 0 || qaIds[currentId] != postId)
        {
            continue;
        }

        // Start from empty strings so that a failed extraction leaves an
        // empty value instead of text left over from an earlier row.
        rowString[0] = '\0';
        bodyString[0] = '\0';
        answerIdString[0] = '\0';

        // Read the row
        textBetweenInclusive(soBuffer, "<row Id=\"", "/>", rowString);
        // Read the body text
        textBetween(rowString, " Body=\"", "\"", bodyString);
        // Read the accepted answer ID (an empty string converts to 0)
        textBetween(rowString, " AcceptedAnswerId=\"", "\"", answerIdString);
        answerId = (int)strtol(answerIdString, NULL, 10);

        // Print a compressed version of the row
        fprintf( stdout, "<<%d<>%d><%s>>\n", postId, answerId, bodyString); fflush(stdout);
    } // end while reading from stdin

    return EXIT_SUCCESS;
}