diff --git a/mfr/extensions/tabular/libs/stdlib_tools.py b/mfr/extensions/tabular/libs/stdlib_tools.py
index 3d3611984..f82007933 100644
--- a/mfr/extensions/tabular/libs/stdlib_tools.py
+++ b/mfr/extensions/tabular/libs/stdlib_tools.py
@@ -1,18 +1,32 @@
 import re
 import csv
+import logging
 
-from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
 from mfr.extensions.tabular import utilities
+from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE
+from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
+
+logger = logging.getLogger(__name__)
 
 
 def csv_stdlib(fp):
     """Read and convert a csv file to JSON format using the Python standard library
-    :param fp: File pointer object
-    :return: tuple of table headers and data
+
+    Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row, and ONLY one full row, to reliably
+    detect the correct dialect of the file.
+
+    :param fp: the file pointer object
+    :return: a tuple of table headers and data
     """
-    data = fp.read(2048)
+
+    # Prepare the first row for sniffing
+    data = fp.read(INIT_SNIFF_SIZE)
+    data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0)
+
+    # Reset the file pointer
     fp.seek(0)
+    # Sniff the first row to find a matching format
     try:
         dialect = csv.Sniffer().sniff(data)
     except csv.Error:
@@ -20,10 +34,14 @@ def csv_stdlib(fp):
     else:
         _set_dialect_quote_attrs(dialect, data)
 
+    # Explicitly delete data when it is no longer used.
     del data
+
+    # Create the CSV reader with the detected dialect
     reader = csv.DictReader(fp, dialect=dialect)
+
+    # Update the reader field names to avoid duplicate column names when performing row extraction
     columns = []
-    # update the reader field names to avoid duplicate column names when performing row extraction
     for idx, fieldname in enumerate(reader.fieldnames or []):
         column_count = sum(1 for column in columns if fieldname == column['name'])
         if column_count:
@@ -92,3 +110,63 @@ def _set_dialect_quote_attrs(dialect, data):
         dialect.quotechar = '"'
     if re.search('"""[[({]\'.+\',', data):
         dialect.doublequote = True
+
+
+def _trim_or_append_data(fp, text, read_size, size_to_sniff, max_render_size=MAX_FILE_SIZE):
+    """Recursively read data from a file and return its full first row. The file starts with
+    ``text`` and the file pointer points to the character immediately after ``text``.
+
+    :param fp: the file pointer from which data is read
+    :param text: the current text chunk in which to look for a newline character
+    :param read_size: the size of the most recent ``fp.read()`` call
+    :param size_to_sniff: the accumulated size of the text to sniff
+    :param max_render_size: the maximum file size allowed for rendering
+    :return: the first row of the file as a string
+    """
+
+    # Return on empty text. This handles the corner case where the CSV is empty or contains only
+    # one line without any newline characters.
+    if len(text) == 0:
+        return ''
+
+    # Try to find the first newline character in the text chunk
+    index = _find_new_line(text)
+    # If found, return the trimmed substring
+    if index != -1:
+        return text[:index]
+    # Otherwise, update `size_to_sniff` and read more text to sniff (twice the last `read_size`)
+    size_to_sniff += read_size
+    read_size *= 2
+    more_text = fp.read(read_size)
+
+    # If the text to sniff now goes over the max file size limit, raise the renderer error: there
+    # is no need to sniff when the file is already too large to be rendered.
+    if size_to_sniff + len(more_text) >= max_render_size:
+        raise TabularRendererError(
+            'The first row of this file is too large for the sniffer to detect the dialect. '
+            'Please download and view it locally.',
+            code=400,
+            extension='csv'
+        )
+    # If the size is still within the limit, recursively check `more_text`
+    return text + _trim_or_append_data(fp, more_text, read_size, size_to_sniff,
+                                       max_render_size=max_render_size)
+
+
+def _find_new_line(text):
+    """Find the index at which the first row of ``text`` ends, i.e. the first occurrence of any
+    newline sequence, whether CRLF, bare LF or bare CR. Checking for the two-character CRLF before
+    a bare LF would miss an earlier LF in a file with mixed line endings, so find the first CR and
+    the first LF independently and take the smaller index; for a CRLF pair that is the index of
+    the CR, which is exactly where the first row ends.
+
+    :param text: the text string to check
+    :return: the index of the first newline character if found; otherwise -1
+    """
+    cr_index = text.find('\r')
+    lf_index = text.find('\n')
+    if cr_index == -1 or lf_index == -1:
+        return max(cr_index, lf_index)
+    return min(cr_index, lf_index)
diff --git a/mfr/extensions/tabular/settings.py b/mfr/extensions/tabular/settings.py
index af6b9b0a2..158bdd00a 100644
--- a/mfr/extensions/tabular/settings.py
+++ b/mfr/extensions/tabular/settings.py
@@ -4,11 +4,13 @@
 config = settings.child('TABULAR_EXTENSION_CONFIG')
 
-MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024))  # 10Mb
+MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024))  # 10MB
 MAX_SIZE = int(config.get('MAX_SIZE', 10000))  # max number of rows or columns allowed.
 TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
 TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))
 
+INIT_SNIFF_SIZE = int(config.get('INIT_SNIFF_SIZE', 2 * 1024))  # 2KB
+
 LIBS = config.get_object('LIBS', {
     '.csv': [libs.csv_stdlib],
     '.tsv': [libs.csv_stdlib],
diff --git a/tests/extensions/tabular/files/invalid.csv b/tests/extensions/tabular/files/empty.csv
similarity index 100%
rename from tests/extensions/tabular/files/invalid.csv
rename to tests/extensions/tabular/files/empty.csv
diff --git a/tests/extensions/tabular/files/long_row.csv b/tests/extensions/tabular/files/long_row.csv
new file mode 100644
index 000000000..57bb90388
--- /dev/null
+++ b/tests/extensions/tabular/files/long_row.csv
@@ -0,0 +1 @@
paper_id,pdf_filename,original_materials_link,original_poweranalysis_link,coded_claim3b,coded_claim4,original_statistic_analysis_type,original_samplesize_calculation_contributor,original_analytic_sample_size_units_reported,original_analytic_sample_size_value_reported,original_statistic_fulltext_reported,original_statistic_effect_type_reported,original_statistic_type_reported,original_statistic_value_reported,original_statistic_df1_reported,original_statistic_df2_reported,original_coefficient_type_reported,original_coefficient_value_reported,original_coefficient_se_reported,original_p_value_type_reported,original_p_value_tails_reported,original_p_value_value_reported,original_effect_size_fulltext_reported,original_effect_size_type_reported,original_effect_size_value_reported,original_analytic_sample_size_value_reference,original_statistic_fulltext_reference,original_statistic_effect_type_reference,original_statistic_type_reference,original_statistic_value_reference,original_statistic_df1_reference,original_statistic_df2_reference,original_coefficient_type_reference,original_coefficient_value_reference,original_coefficient_se_reference,original_p_value_tails_reference,original_p_value_value_reference,original_effect_size_type_reference,original_effect_size_value_reference,rr_stage1_analytic_sample_size,rr_stage2_analytic_sample_size,original_poweranalysis_notes,original_statsteam_contributor,original_es_ub_ci_nativeunits,original_es_lb_ci_nativeunits,original_pearsons_r_defined,original_pearsons_r_numeric,original_es_ub_ci_pearson,original_es_lb_ci_pearson,original_power_small,original_power_medium,original_power_50_original_effect,original_power_75_original_effect,original_statsteam_notes,to_do,priority,stats_rownum eOQm,Mason_JournExPsychGen_2012_eOQm,https://osf.io/zmdyg,https://osf.io/42e5k/,"a 2 (block: progressive, stagnant) X 2 (scale valence: positive, negative) repeated-measures ANOVA on participants’ responses to the positive and negative affect scales following the main experimental exposure. The critical manipulation was between lists of words constructed as either stagnant (a target word, and 11 items with a high probability of being mentioned by previous participants as a response to that word, e.g”lettuce, tomato, green, vegetable…“, or progressive (which were constructed by choosing words across association matrices, e.g.”salt, pepper, cat, dog…“)","A 2 (block: progressive, stagnant) X 2 (scale valence: positive, negative) repeated-measures ANOVA on participants’ mood ratings yielded a significant interaction, F(1, 76) = 6.63, p = .01, partial eta squared = .08",ANOVA,Melissa Kline,participants,77,"F(1, 76) = 6.63",interaction,F,6.63,1,76,NC,NC,NC,exact,NC,0.01,partial eta squared = .08,partial_eta_squared,0.08,,"F(1, 76) = 6.63",interaction,F,6.63,1,76,,,,,0.01,cohen_f_squared,0.0872368,218,485,"This is the first 'extended' original-paper stats set, used to test the workflow. As such, we hope it is a straightforward, within-subjects ANOVA! Note that the power calculations were performed on the statistics as reported, so these (a) are copied over into the 'recalculated' fields since they are the values used in the power analysis and (b) they have not actually been checked for internal consistency.",,,,,,,,,,,,,to_review,1,1 z106,Pappu_JournAcaMarkSci_2014_yDyG_z106,https://osf.io/3urx9/?view_only=e0dda2b5a95540c4b053dfd87752574c,https://osf.io/hmbzp/,"We tested the hypotheses using structural equation modeling. 
The predictors were the sponsor (dummy coded), sponsorship relationship fit, sponsor-sponsee similarity, and their interaction. The outcome variable was participant intention to support the nonprofit, and the mediating variables were nonprofit clarity of positioning and the participant’s attitude toward the sponsorship | In Model B, in addition to estimating the main effects, we estimated a fit by similarity interaction on intent to support the nonprofit, through the mediator of clarity of positioning.","As predicted, fit and similarity jointly affected participants’ intention to support the nonprofit (b = .10, p = .019) via clarity of positioning, as seen in Model B",SEM,Sam Field,participants,189,"b = .10, p = .019",interaction,NC,NC,NC,NC,NC,0.1,NC,exact,NC,0.019,NC,NC,NC,189,"coefficient = 0.10, p = 0.019",,,,,,path coefficient,0.1,0.04822441,one-tailed,0.019,,,821,1840,This replication has finished data collection; RR statistics aren't available yet but expected to be soon,,,,,,,,,,,,,to_do,1,2 R8RN,Usta_JournMarketRes_2011_R8RN,https://osf.io/ks7ej/?view_only=5115343bd65c42639eb1fa8606d673d7,https://osf.io/mknax/,"Participants were randomly assigned to one of two experimental conditions: They were reminded of either a past decision that they delegated or one that they made independently, in both conditions by being asked to briefly describe the decision and then describe the thoughts and feelings that experiencing the decision caused them to have. Participants were also shown pictures of two desserts (a chocolate cake and a fruit salad) and asked to indicate which one they would eat if they had the two desserts in front of them right then. The measure of self-regulatory resource depletion was participants’ choice of dessert, which was analyzed with a logistic regression.","A logistic regression analysis showed that the choice share of the chocolate cake among those who had been instructed to recall a delegated decision (70.6%) was significantly greater than among participants who had been asked to recall an independent decision (31.6%; χ2(1, 36) = 5.46, p < .03)",logistic regression,Chris Aberson,participants,36,"chi squared(1, 36) = 5.46, p < .03",main effect,chi-square,5.46,1,36,NC,NC,NC,less-than,NC,0.03,NC,NC,NC,,,,,,,,,,,two-tailed,,log_odds_ratio,1.648251,119,263,"A regression powered using log odds; sample size projections conducted by Chris A. 
using Gpower ",,,,,,,,,,,,,to_do,2,3 GXEW,Srinivasan_Cognition_2010_GXEW,https://osf.io/acmz2/?view_only=79221d82b98146808721d4d43c5e2ea3,https://osf.io/6gtwn/,"A 2 X 2 X 2 mixed-factor repeated-measures ANOVA examined the effects of the between-subjects factor of group (congruent or incongruent tone/line pairings), and the within-subjects factors of test trial type (novel or familiar) and difficulty (easy or difficult) on the participants’ familiarity ratings","A mixed-factor repeated-measures ANOVA on Experiment 3 participants’ ratings of item familiarity (with factors for congruent/incongruent condition group, novel vs. familiar trial type, and easy vs. difficult trial type) revealed a group X novelty interaction, F(1, 32) = 4.38,p < .05, indicating that participants in the congruent group better distinguished the familiar from novel pairings than participants in the incongruent group",ANOVA,NC,participants,37,"F(1, 32) = 4.38,p < .05",interaction,F,4.38,1,32,NC,NC,NC,less-than,NC,0.05,NC,NC,NC,34,"F(1,32) = 4.38",,F,4.38,1,32,,,,,,,0.136875,141,311,"A more complex ANOVA case; stats team please review the power analysis to get additional 'reference' values - here and in other cases, these are left blank if I didn't feel certain of what something was or how calculated ",,,,,,,,,,,,,to_do,2,4
\ No newline at end of file
diff --git a/tests/extensions/tabular/files/one_line.csv b/tests/extensions/tabular/files/one_line.csv
new file mode 100644
index 000000000..92a94bb0a
--- /dev/null
+++ b/tests/extensions/tabular/files/one_line.csv
@@ -0,0 +1 @@
+one, line, csv
\ No newline at end of file
diff --git a/tests/extensions/tabular/test_renderer.py b/tests/extensions/tabular/test_renderer.py
index 5b7049fcc..4a4b23702 100644
--- a/tests/extensions/tabular/test_renderer.py
+++ b/tests/extensions/tabular/test_renderer.py
@@ -25,8 +25,18 @@ def test_csv_file_path():
 
 
 @pytest.fixture
-def invalid_csv_file_path():
-    return os.path.join(PATH, 'files', 'invalid.csv')
+def empty_csv_file_path():
+    return os.path.join(PATH, 'files', 'empty.csv')
+
+
+@pytest.fixture
+def one_line_csv_file_path():
+    return os.path.join(PATH, 'files', 'one_line.csv')
+
+
+@pytest.fixture
+def long_row_csv_file_path():
+    return os.path.join(PATH, 'files', 'long_row.csv')
 
 
 @pytest.fixture
@@ -107,10 +117,26 @@ def test_render_tabular_csv(self, test_csv_file_path, assets_url, export_url):
         body = renderer.render()
         assert BODY in body
 
-    def test_render_tabular_csv_invalid(self, invalid_csv_file_path, assets_url, export_url):
+    def test_render_tabular_csv_one_line(self, one_line_csv_file_path, assets_url, export_url):
+        metadata = ProviderMetadata('test', '.csv', 'text/plain', '1234',
+                                    'http://wb.osf.io/file/test.csv?token=1234')
+        renderer = TabularRenderer(metadata, one_line_csv_file_path, url, assets_url, export_url)
+        body = renderer.render()
+        assert BODY in body
+
+    def test_render_tabular_csv_long_row(self, long_row_csv_file_path, assets_url, export_url):
+        metadata = ProviderMetadata('test', '.csv', 'text/plain', '1234',
+                                    'http://wb.osf.io/file/test.csv?token=1234')
+        renderer = TabularRenderer(metadata, long_row_csv_file_path, url, assets_url, export_url)
+        body = renderer.render()
+        assert BODY in body
+
+    # There is no way to test an "invalid" CSV unless the file is empty, since the CSV sniffer
+    # attempts to make the best guess syntactically even if the file is semantically invalid.
+    def test_render_tabular_csv_empty(self, empty_csv_file_path, assets_url, export_url):
         metadata = ProviderMetadata('test', '.csv', 'text/plain', '1234',
                                     'http://wb.osf.io/file/test.csv?token=1234')
-        renderer = TabularRenderer(metadata, invalid_csv_file_path, url, assets_url, export_url)
+        renderer = TabularRenderer(metadata, empty_csv_file_path, url, assets_url, export_url)
         with pytest.raises(exceptions.EmptyTableError):
             renderer.render()
diff --git a/tests/extensions/tabular/test_stdlib_tools.py b/tests/extensions/tabular/test_stdlib_tools.py
new file mode 100644
index 000000000..4700893f5
--- /dev/null
+++ b/tests/extensions/tabular/test_stdlib_tools.py
@@ -0,0 +1,132 @@
+import logging
+from io import StringIO
+from random import choice
+from string import ascii_letters
+
+import pytest
+
+from mfr.extensions.tabular.exceptions import TabularRendererError
+from mfr.extensions.tabular.libs.stdlib_tools import _find_new_line, _trim_or_append_data
+
+logger = logging.getLogger(__name__)
+
+INIT_SNIFF_SIZE = 128
+MAX_RENDER_SIZE = INIT_SNIFF_SIZE * 8
+
+
+@pytest.fixture
+def text_with_lf():
+    return 'text_with_lf\nanother_row\na_third_row'
+
+
+@pytest.fixture
+def text_with_cr():
+    return 'text_with_cr\ranother_row\ra_third_row'
+
+
+@pytest.fixture
+def text_with_cr_lf():
+    return 'text_with_cr_lf\r\nanother_row\r\na_third_row'
+
+
+@pytest.fixture
+def text_without_new_line():
+    return 'text_without_new_line\tthe_same_row\tthe_same_row_continued'
+
+
+@pytest.fixture
+def small_text_partial():
+    return ''.join(choice(ascii_letters) for _ in range(INIT_SNIFF_SIZE - 2))
+
+
+@pytest.fixture
+def fp_small(small_text_partial):
+    return StringIO('{}\nanother_row\n'.format(small_text_partial))
+
+
+@pytest.fixture
+def large_text_partial():
+    return ''.join(choice(ascii_letters) for _ in range(MAX_RENDER_SIZE - INIT_SNIFF_SIZE))
+
+
+@pytest.fixture
+def fp_large(large_text_partial):
+    return StringIO('{}\nanother_row\n'.format(large_text_partial))
+
+
+@pytest.fixture
+def fp_empty():
+    return StringIO('')
+
+
+@pytest.fixture
+def one_line_text():
+    return ''.join(choice(ascii_letters) for _ in range(MAX_RENDER_SIZE - INIT_SNIFF_SIZE))
+
+
+@pytest.fixture
+def fp_one_line(one_line_text):
+    return StringIO(one_line_text)
+
+
+@pytest.fixture
+def fp_oversize():
+    oversize_text_partial = ''.join(choice(ascii_letters) for _ in range(MAX_RENDER_SIZE + 2))
+    return StringIO('{}the_same_row\nanother_row\n'.format(oversize_text_partial))
+
+
+class TestFindNewLine:
+
+    def test_find_new_line_lf(self, text_with_lf):
+        index = _find_new_line(text_with_lf)
+        assert index == 12
+
+    def test_find_new_line_cr(self, text_with_cr):
+        index = _find_new_line(text_with_cr)
+        assert index == 12
+
+    def test_find_new_line_cr_lf(self, text_with_cr_lf):
+        index = _find_new_line(text_with_cr_lf)
+        assert index == 15
+
+    def test_find_new_line_none(self, text_without_new_line):
+        index = _find_new_line(text_without_new_line)
+        assert index == -1
+
+
+class TestTrimOrAppendData:
+
+    def test_trim_or_append_data_small(self, fp_small, small_text_partial):
+        data = fp_small.read(INIT_SNIFF_SIZE)
+        data = _trim_or_append_data(fp_small, data, INIT_SNIFF_SIZE, 0,
+                                    max_render_size=MAX_RENDER_SIZE)
+        fp_small.close()
+        assert data == small_text_partial
+
+    def test_trim_or_append_data_large(self, fp_large, large_text_partial):
+        data = fp_large.read(INIT_SNIFF_SIZE)
+        data = _trim_or_append_data(fp_large, data, INIT_SNIFF_SIZE, 0,
+                                    max_render_size=MAX_RENDER_SIZE)
+        fp_large.close()
+        assert data == large_text_partial
+ + def test_trim_or_append_data_empty(self, fp_empty): + data = fp_empty.read(INIT_SNIFF_SIZE) + data = _trim_or_append_data(fp_empty, data, INIT_SNIFF_SIZE, 0, + max_render_size=MAX_RENDER_SIZE) + fp_empty.close() + assert data == '' + + def test_trim_or_append_data_one_line(self, fp_one_line, one_line_text): + data = fp_one_line.read(INIT_SNIFF_SIZE) + data = _trim_or_append_data(fp_one_line, data, INIT_SNIFF_SIZE, 0, + max_render_size=MAX_RENDER_SIZE) + fp_one_line.close() + assert data == one_line_text + + def test_trim_or_append_data_error_upon_max_render_size(self, fp_oversize): + with pytest.raises(TabularRendererError): + data = fp_oversize.read(INIT_SNIFF_SIZE) + _trim_or_append_data(fp_oversize, data, INIT_SNIFF_SIZE, 0, + max_render_size=MAX_RENDER_SIZE) + fp_oversize.close()
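For reference, a minimal usage sketch (not part of the diff) of how the new helper behaves. It drives `_trim_or_append_data()` the same way `csv_stdlib()` does, but with small stand-in values for the configured `INIT_SNIFF_SIZE` (2KB) and `MAX_FILE_SIZE` (10MB) defaults so the doubling reads are easy to trace:

    from io import StringIO

    from mfr.extensions.tabular.libs.stdlib_tools import _trim_or_append_data

    init_sniff_size = 8    # stand-in for INIT_SNIFF_SIZE
    max_render_size = 64   # stand-in for MAX_FILE_SIZE

    fp = StringIO('col_a,col_b,col_c\r\n1,2,3\r\n4,5,6\r\n')
    data = fp.read(init_sniff_size)  # 'col_a,co' -- not a full row yet
    # The helper reads 16 more characters (twice the last read), finds the CRLF and trims there
    first_row = _trim_or_append_data(fp, data, init_sniff_size, 0,
                                     max_render_size=max_render_size)
    fp.seek(0)  # rewind before handing fp to the csv reader, as csv_stdlib() does
    assert first_row == 'col_a,col_b,col_c'

If the accumulated reads reach `max_render_size` before any newline is found, the call raises `TabularRendererError` instead, which is the case `test_trim_or_append_data_error_upon_max_render_size` exercises.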