File Reader | Add next_line_file_offset (and read_file_offset cfg option)

romayalon · romayalon · commit b223ddb6812e · 2025-04-09T14:56:58.000+03:00
Signed-off-by: Romy <35330373+romayalon@users.noreply.github.com>
diff --git a/src/test/unit_tests/jest_tests/test_newline_reader.test.js b/src/test/unit_tests/jest_tests/test_newline_reader.test.js
@@ -51,6 +51,26 @@ describe('newline_reader', () => {
 			expect(result).toStrictEqual(UTF8DATA_ARR);
 		});
 
+		it('next_line_file_offset - can process utf8 characters when termination with newline character', async () => { // checks reader.next_line_file_offset after every line when the data ends with '\n'
+			const UTF8DATA_BUF = Buffer.from(UTF8DATA_ARR.join('\n') + '\n', 'utf8'); // every entry, including the last one, is followed by a '\n'
+
+			const reader = new NewlineReader({}, '', { skip_leftover_line: true, read_file_offset: 0 }); // read_file_offset: 0 starts reading at the first byte
+			// @ts-ignore
+			reader.fh = mocked_file_handler(UTF8DATA_BUF); // mocked handle built from the in-memory buffer instead of an opened file
+
+			const result = [];
+			let expected_cur_next_line_file_offset = 0; // running byte offset the reader is expected to report
+			const [processed] = await reader.forEach(async entry => {
+				result.push(entry);
+				expected_cur_next_line_file_offset += Buffer.byteLength(entry, 'utf8') + 1; // UTF-8 byte length of the line plus 1 byte for its '\n'
+				expect(reader.next_line_file_offset).toBe(expected_cur_next_line_file_offset);
+				return true; // NOTE(review): presumably tells forEach to continue iterating — confirm
+			});
+
+			expect(processed).toBe(UTF8DATA_ARR.length); // every entry should have been handed to the callback
+			expect(result).toStrictEqual(UTF8DATA_ARR);
+		});
+
 		it('can process utf8 characters when termination not with new line character', async () => {
 			const UTF8DATA_BUF = Buffer.from(UTF8DATA_ARR.join('\n'), 'utf8');
 
@@ -68,6 +88,47 @@ describe('newline_reader', () => {
 			expect(result).toStrictEqual(UTF8DATA_ARR);
 		});
 
+		it('next_line_file_offset - can process utf8 characters when termination not with new line character', async () => { // checks reader.next_line_file_offset after every line when the data does NOT end with '\n'
+			const UTF8DATA_BUF = Buffer.from(UTF8DATA_ARR.join('\n'), 'utf8'); // '\n' only between entries, none after the last one
+
+			const reader = new NewlineReader({}, '', { read_file_offset: 0 }); // read_file_offset: 0 starts reading at the first byte
+			// @ts-ignore
+			reader.fh = mocked_file_handler(UTF8DATA_BUF); // mocked handle built from the in-memory buffer instead of an opened file
+
+			const result = [];
+			let expected_cur_next_line_file_offset = 0; // running byte offset the reader is expected to report
+			const [processed] = await reader.forEach(async entry => {
+				result.push(entry);
+				expected_cur_next_line_file_offset += Buffer.byteLength(entry, 'utf8') + (reader.eof ? 0 : 1); // the leftover last line is returned once eof is set and has no '\n' byte to count
+				expect(reader.next_line_file_offset).toBe(expected_cur_next_line_file_offset);
+				return true; // NOTE(review): presumably tells forEach to continue iterating — confirm
+			});
+
+			expect(processed).toBe(UTF8DATA_ARR.length); // every entry, including the unterminated last one, should be processed
+			expect(result).toStrictEqual(UTF8DATA_ARR);
+		});
+
+		it('next_line_file_offset starts from the second line - can process utf8 characters when termination not with new line character', async () => { // checks that a non-zero read_file_offset starts reading at the second line and that offsets continue from there
+			const UTF8DATA_BUF = Buffer.from(UTF8DATA_ARR.join('\n'), 'utf8'); // '\n' only between entries, none after the last one
+			const expected_to_be_processed_data_array = UTF8DATA_ARR.slice(1); // everything except the first entry
+			const initial_next_line_file_offset = Buffer.byteLength(UTF8DATA_ARR[0], 'utf8') + 1; // bytes of the first line plus its '\n', i.e. the byte where the second line begins
+			const reader = new NewlineReader({}, '', { read_file_offset: initial_next_line_file_offset}); // the constructor seeds both read_file_offset and next_line_file_offset from this value
+			// @ts-ignore
+			reader.fh = mocked_file_handler(UTF8DATA_BUF); // mocked handle built from the in-memory buffer instead of an opened file
+
+			const result = [];
+			let expected_cur_next_line_file_offset = initial_next_line_file_offset; // running expected offset, starting past the first line
+			const [processed] = await reader.forEach(async entry => {
+				result.push(entry);
+				expected_cur_next_line_file_offset += Buffer.byteLength(entry, 'utf8') + (reader.eof ? 0 : 1); // the leftover last line is returned once eof is set and has no '\n' byte to count
+				expect(reader.next_line_file_offset).toBe(expected_cur_next_line_file_offset);
+				return true; // NOTE(review): presumably tells forEach to continue iterating — confirm
+			});
+
+			expect(processed).toBe(expected_to_be_processed_data_array.length);
+			expect(result).toStrictEqual(expected_to_be_processed_data_array);
+		});
+
 		it('can process utf8 characters when termination not with new line character [bufsize = 4]', async () => {
 			const expected = "abc";
 			const UTF8DATA_ARR_TEMP = [ ...UTF8DATA_ARR, expected ];
@@ -86,5 +147,26 @@ describe('newline_reader', () => {
 			expect(processed).toBe(1);
 			expect(result).toStrictEqual([expected]);
 		});
+
+		it('next_line_file_offset - can process utf8 characters when termination not with new line character [bufsize = 4]', async () => { // NOTE(review): title says bufsize = 4 but the reader below is created with bufsize: 256 — confirm which is intended
+			const expected = "abc";
+			const UTF8DATA_ARR_TEMP = [ ...UTF8DATA_ARR, expected ]; // existing entries followed by one short final line
+			const UTF8DATA_BUF = Buffer.from(UTF8DATA_ARR_TEMP.join('\n'), 'utf8'); // no '\n' after the final "abc"
+
+			const reader = new NewlineReader({}, '', { bufsize: 256, skip_overflow_lines: true, read_file_offset: 0 });
+			// @ts-ignore
+			reader.fh = mocked_file_handler(UTF8DATA_BUF); // mocked handle built from the in-memory buffer instead of an opened file
+
+			const result = [];
+			const [processed] = await reader.forEach(async entry => {
+				result.push(entry);
+				return true; // NOTE(review): presumably tells forEach to continue iterating — confirm
+			});
+
+			expect(processed).toBe(1); // only "abc" is expected to reach the callback; NOTE(review): presumably the other entries overflow the buffer and are skipped — confirm against UTF8DATA_ARR
+			expect(result).toStrictEqual([expected]);
+			const expected_cur_next_line_file_offset = UTF8DATA_BUF.length; // total byte length of the input
+			expect(reader.next_line_file_offset).toBe(expected_cur_next_line_file_offset); // after the last line the reported offset should equal the end of the data
+		});
 	});
 });
diff --git a/src/util/file_reader.js b/src/util/file_reader.js
@@ -30,6 +30,7 @@ class NewlineReader {
      *  bufsize?: number;
      *  skip_leftover_line?: boolean;
      *  skip_overflow_lines?: boolean;
+     *  read_file_offset?: number;
      * }} [cfg]
      **/
     constructor(fs_context, filepath, cfg) {
@@ -41,18 +42,19 @@ class NewlineReader {
         this.fs_context = fs_context;
         this.fh = null;
         this.eof = false;
-        this.readoffset = 0;
+        this.read_file_offset = cfg?.read_file_offset || 0;
 
         this.buf = Buffer.alloc(cfg?.bufsize || 64 * 1024);
         this.start = 0;
         this.end = 0;
         this.overflow_state = false;
+        this.next_line_file_offset = cfg?.read_file_offset || 0;
     }
 
     info() {
         return {
             path: this.path,
-            read_offset: this.readoffset,
+            read_offset: this.read_file_offset,
             overflow_state: this.overflow_state,
             start: this.start,
             end: this.end,
@@ -78,9 +80,9 @@ class NewlineReader {
                         this.start += term_idx + 1;
                         continue;
                     }
-
                     const line = this.buf.toString('utf8', this.start, this.start + term_idx);
                     this.start += term_idx + 1;
+                    this.next_line_file_offset = this.read_file_offset - (this.end - this.start);
                     return line;
                 }
             }
@@ -106,7 +108,7 @@ class NewlineReader {
 
             // read from file
             const avail = this.buf.length - this.end;
-            const read = await this.fh.read(this.fs_context, this.buf, this.end, avail, this.readoffset);
+            const read = await this.fh.read(this.fs_context, this.buf, this.end, avail, this.read_file_offset);
             if (!read) {
                 this.eof = true;
 
@@ -118,13 +120,15 @@ class NewlineReader {
                         console.warn('line too long finally terminated at eof:', this.info());
                     } else {
                         const line = this.buf.toString('utf8', this.start, this.end);
+                        this.start = this.end;
+                        this.next_line_file_offset = this.read_file_offset;
                         return line;
                     }
                 }
 
                 return null;
             }
-            this.readoffset += read;
+            this.read_file_offset += read;
             this.end += read;
         }
 
@@ -169,7 +173,7 @@ class NewlineReader {
     // was moved, this will still keep on reading from the previous FD.
     reset() {
         this.eof = false;
-        this.readoffset = 0;
+        this.read_file_offset = 0;
         this.start = 0;
         this.end = 0;
         this.overflow_state = false;