-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract.js
92 lines (82 loc) · 2.5 KB
/
extract.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
var _ = require('underscore'),
tesseract = require('node-tesseract'),
fs = require('fs');
/**
 * Extractor constructor.
 *
 * Stores the two directories an instance works with, both located next to
 * this script:
 *   - filesPath:  "<script dir>/documents/"       (input documents)
 *   - exportPath: "<script dir>/data_extracted/"  (exported text files)
 * Both paths keep their trailing slash so a file name can be appended directly.
 */
function Extractor() {
    var baseDir = __dirname;

    this.filesPath = baseDir + '/documents/';
    this.exportPath = baseDir + '/data_extracted/';
}
/**
 * Lists the entries of `this.filesPath` and hands them to `this.extract`,
 * starting at index 0.
 *
 * @param {Function} callback Invoked with no arguments when the run is over:
 *   after `extract` reports completion, immediately when the directory is
 *   empty, or right after a directory-read error has been logged.
 */
Extractor.prototype.process = function(callback) {
    var self = this;

    fs.readdir(self.filesPath, function(err, files) {
        if (err) {
            console.log('Error while reading files.');
            console.log(err);
            callback();
            return;
        }

        // An empty directory has nothing to extract. Starting extraction at
        // index 0 anyway would build the path "<filesPath>undefined".
        if (files.length === 0) {
            callback();
            return;
        }

        self.extract(0, files, function() {
            // Wrapped so the caller's callback is always invoked without arguments.
            callback();
        });
    });
};
/**
 * Runs OCR on `files[count]`, writes the recognised text to the export
 * directory, then moves on to the next entry until the list is exhausted.
 *
 * The output file is named "<exportPath><timestamp>-<file name>.txt"; when
 * `isReceipt` classifies the text as a receipt, "_receipt" is inserted before
 * the ".txt" extension.
 *
 * A failure on one file (OCR or write) is logged and the remaining files are
 * still processed, so a single unreadable document does not abort the batch.
 *
 * @param {number} count Index in `files` of the entry to process.
 * @param {string[]} files File names relative to `this.filesPath`.
 * @param {Function} callback Invoked with no arguments once every entry from
 *   `count` onward has been attempted (immediately if there is none).
 */
Extractor.prototype.extract = function(count, files, callback) {
    var self = this;

    // Nothing (left) to process: without this guard an empty list or an
    // out-of-range index would send "<filesPath>undefined" to tesseract.
    if (!files || count >= files.length) {
        callback();
        return;
    }

    var fileName = files[count],
        filePath = self.filesPath + fileName,
        next = function() {
            self.extract(count + 1, files, callback);
        };

    tesseract.process(filePath, function(err, text) {
        console.log('Processing ' + filePath);

        if (err) {
            console.log('Error while processing.');
            console.log(err);
            // Keep going, mirroring how write errors are handled below.
            next();
            return;
        }

        // Timestamp prefix keeps exports from repeated runs from overwriting each other.
        var exportedName = self.exportPath + Date.now() + '-' + fileName;

        self.isReceipt(text, function(isReceipt) {
            if (isReceipt) {
                console.log('It\'s a receipt!');
                exportedName = exportedName + '_receipt';
            } else {
                console.log('I\'m not saying that this is NOT a receipt, but I couldn\'t identify it as such.');
            }

            fs.writeFile(exportedName + '.txt', text, function(writeErr) {
                if (writeErr) {
                    console.log(writeErr);
                    console.log('Error while exporting data for ' + fileName + '\n');
                } else {
                    console.log('Data exported to: ' + exportedName + '.txt' + '\n');
                }
                next();
            });
        });
    });
};
/**
 * Decides whether a piece of extracted text looks like a receipt by checking
 * it against a small list of case-insensitive keyword patterns.
 *
 * @param {*} text Extracted text. `null`/`undefined` are treated as empty
 *   text; any other non-string value is converted with `String()` instead of
 *   throwing.
 * @param {Function} callback Called synchronously with a single boolean:
 *   `true` when at least one pattern matches, `false` otherwise.
 */
Extractor.prototype.isReceipt = function(text, callback) {
    // meaning: words that we should look for in a receipt (regex)
    var receiptIdentifiers = [/receipt/i, /total/i],
        haystack = (text === null || text === undefined) ? '' : String(text),
        // None of the patterns use the g/y flags, so .test() keeps no state between calls.
        isReceipt = receiptIdentifiers.some(function(identifier) {
            return identifier.test(haystack);
        });

    callback(isReceipt);
};
// Script entry point: create an extractor, run it, and print a final marker
// once its completion callback is invoked.
var extractor = new Extractor();

function reportDone() {
    console.log('DONE!');
}

extractor.process(reportDone);