Skip to content
This repository was archived by the owner on Jan 6, 2025. It is now read-only.

Commit 1f71513

Browse files
committed
Fix no table found warning and add tests for two tables
1 parent bf89411 commit 1f71513

File tree

5 files changed

+162
-4
lines changed

5 files changed

+162
-4
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ install:
1515
pip install ".[dev]"
1616

1717
test:
18-
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
18+
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
1919

2020
docs:
2121
cd docs && make html

camelot/parsers/stream.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -309,10 +309,21 @@ def _generate_columns_and_rows(self, table_idx, tk):
309309
cols.append(text_x_max)
310310
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
311311
else:
312+
# calculate mode of the list of number of elements in
313+
# each row to guess the number of columns
312314
ncols = max(set(elements), key=elements.count)
313315
if ncols == 1:
314-
warnings.warn("No tables found on {}".format(
315-
os.path.basename(self.rootname)))
316+
# if mode is 1, the page usually contains no tables
317+
# but there can be cases where the list can be skewed,
318+
# try to remove all 1s from list in this case and
319+
# see if the list contains elements, if yes, then use
320+
# the mode after removing 1s
321+
elements = list(filter(lambda x: x != 1, elements))
322+
if len(elements):
323+
ncols = max(set(elements), key=elements.count)
324+
else:
325+
warnings.warn("No tables found in table area {}".format(
326+
table_idx + 1))
316327
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
317328
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
318329
inner_text = []

setup.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
test=pytest
33

44
[tool:pytest]
5-
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
5+
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
66
python_files = tests/test_*.py

tests/data.py

+125
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,99 @@
7474
["NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"]
7575
]
7676

77+
data_stream_two_tables_1 = [
78+
["[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)", "", "", "", "", "", "", "", "", ""],
79+
["Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", "", "", "", "", "", "", "", "", ""],
80+
["by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", "", "", "", "", "", "", "", "", ""],
81+
["could represent multiple arrests of the same person. See text, this section and source]", "", "", "", "", "", "", "", "", ""],
82+
["", "", "Total", "", "", "Male", "", "", "Female", ""],
83+
["Offense charged", "", "Under 18", "18 years", "", "Under 18", "18 years", "", "Under 18", "18 years"],
84+
["", "Total", "years", "and over", "Total", "years", "and over", "Total", "years", "and over"],
85+
["Total . . . . . . . . . . . . . . . . . . . . . . . . .", "11,062 .6", "1,540 .0", "9,522 .6", "8,263 .3", "1,071 .6", "7,191 .7", "2,799 .2", "468 .3", "2,330 .9"],
86+
["Violent crime . . . . . . . . . . . . . . . . . .", "467 .9", "69 .1", "398 .8", "380 .2", "56 .5", "323 .7", "87 .7", "12 .6", "75 .2"],
87+
["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""],
88+
["manslaughter . . . . . . . .. .. .. .. ..", "10.0", "0.9", "9.1", "9.0", "0.9", "8.1", "1.1", "–", "1.0"],
89+
["Forcible rape . . . . . . . .. .. .. .. .. .", "17.5", "2.6", "14.9", "17.2", "2.5", "14.7", "–", "–", "–"],
90+
["Robbery . . . .. .. . .. . ... . ... . ...", "102.1", "25.5", "76.6", "90.0", "22.9", "67.1", "12.1", "2.5", "9.5"],
91+
["Aggravated assault . . . . . . . .. .. ..", "338.4", "40.1", "298.3", "264.0", "30.2", "233.8", "74.4", "9.9", "64.5"],
92+
["Property crime . . . . . . . . . . . . . . . . .", "1,396 .4", "338 .7", "1,057 .7", "875 .9", "210 .8", "665 .1", "608 .2", "127 .9", "392 .6"],
93+
["Burglary . .. . . . . .. ... .... .... ..", "240.9", "60.3", "180.6", "205.0", "53.4", "151.7", "35.9", "6.9", "29.0"],
94+
["Larceny-theft . . . . . . . .. .. .. .. .. .", "1,080.1", "258.1", "822.0", "608.8", "140.5", "468.3", "471.3", "117.6", "353.6"],
95+
["Motor vehicle theft . . . . .. .. . .... .", "65.6", "16.0", "49.6", "53.9", "13.3", "40.7", "11.7", "2.7", "8.9"],
96+
["Arson .. . . . .. . ... .... .... .... .", "9.8", "4.3", "5.5", "8.1", "3.7", "4.4", "1.7", "0.6", "1.1"],
97+
["Other assaults .. . . . . .. . ... . ... ..", "1,061.3", "175.3", "886.1", "785.4", "115.4", "670.0", "276.0", "59.9", "216.1"],
98+
["Forgery and counterfeiting .. . . . . . ..", "68.9", "1.7", "67.2", "42.9", "1.2", "41.7", "26.0", "0.5", "25.5"],
99+
["Fraud .... .. . . .. ... .... .... ....", "173.7", "5.1", "168.5", "98.4", "3.3", "95.0", "75.3", "1.8", "73.5"],
100+
["Embezzlement . . .. . . . .. . ... . ....", "14.6", "–", "14.1", "7.2", "–", "6.9", "7.4", "–", "7.2"],
101+
["Stolen property 1 . . . . . . .. . .. .. ...", "84.3", "15.1", "69.2", "66.7", "12.2", "54.5", "17.6", "2.8", "14.7"],
102+
["Vandalism . . . . . . . .. .. .. .. .. ....", "217.4", "72.7", "144.7", "178.1", "62.8", "115.3", "39.3", "9.9", "29.4"],
103+
["Weapons; carrying, possessing, etc. .", "132.9", "27.1", "105.8", "122.1", "24.3", "97.8", "10.8", "2.8", "8.0"],
104+
["Prostitution and commercialized vice",
105+
"56.9", "1.1", "55.8", "17.3", "–", "17.1", "39.6", "0.8", "38.7"],
106+
["Sex offenses 2 . . . . .. . . . .. .. .. . ..", "61.5", "10.7", "50.7", "56.1", "9.6", "46.5", "5.4", "1.1", "4.3"],
107+
["Drug abuse violations . . . . . . . .. ...", "1,333.0", "136.6", "1,196.4", "1,084.3", "115.2", "969.1", "248.7", "21.4", "227.3"],
108+
["Gambling .. . . . . .. ... . ... . ... ...", "8.2", "1.4", "6.8", "7.2", "1.4", "5.9", "0.9", "–", "0.9"],
109+
["Offenses against the family and", "", "", "", "", "", "", "", "", ""],
110+
["children . . . .. . . .. .. .. .. .. .. . ..", "92.4", "3.7", "88.7", "68.9", "2.4", "66.6", "23.4", "1.3", "22.1"],
111+
["Driving under the influence . . . . . .. .", "1,158.5", "109.2", "1,147.5", "895.8", "8.2", "887.6", "262.7", "2.7", "260.0"],
112+
["Liquor laws . . . . . . . .. .. .. .. .. .. .", "48.2", "90.2", "368.0", "326.8", "55.4", "271.4",
113+
"131.4", "34.7", "96.6"],
114+
["Drunkenness . . .. . . . .. . ... . ... ..", "488.1", "11.4", "476.8", "406.8", "8.5", "398.3", "81.3", "2.9", "78.4"],
115+
["Disorderly conduct . .. . . . . . .. .. .. .", "529.5", "136.1", "393.3", "387.1", "90.8", "296.2", "142.4", "45.3", "97.1"],
116+
["Vagrancy . . . .. . . . ... .... .... ...", "26.6", "2.2", "24.4", "20.9", "1.6", "19.3", "5.7", "0.6", "5.1"],
117+
["All other offenses (except traffic) . . ..", "306.1", "263.4", "2,800.8", "2,337.1", "194.2", "2,142.9", "727.0", "69.2", "657.9"],
118+
["Suspicion . . . .. . . .. .. .. .. .. .. . ..", "1.6", "–", "1.4", "1.2", "–", "1.0", "–", "–", "–"],
119+
["Curfew and loitering law violations ..", "91.0", "91.0", "(X)", "63.1", "63.1", "(X)", "28.0", "28.0", "(X)"],
120+
["Runaways . . . . . . . .. .. .. .. .. ....", "75.8", "75.8", "(X)", "34.0", "34.0", "(X)", "41.8", "41.8", "(X)"],
121+
["", "– Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", "", "", "", "", "", "", "", ""],
122+
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", "", "", "", "", ""]
123+
]
124+
125+
data_stream_two_tables_2 = [
126+
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", ""],
127+
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
128+
["[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "", "", "", "", ""],
129+
["with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", "", "", "", "", ""],
130+
["", "", "", "", "American", ""],
131+
["Offense charged", "", "", "",
132+
"Indian/Alaskan", "Asian Pacific"],
133+
["", "Total", "White", "Black", "Native", "Islander"],
134+
["Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "10,690,561", "7,389,208", "3,027,153", "150,544", "123,656"],
135+
["Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "456,965", "268,346", "177,766", "5,608", "5,245"],
136+
["Murder and nonnegligent manslaughter . .. ... .", "9,739", "4,741", "4,801", "100", "97"],
137+
["Forcible rape . . . . . . . .. .. .. .. .... .. ...... .", "16,362", "10,644", "5,319", "169", "230"],
138+
["Robbery . . . . .. . . . ... . ... . .... .... .... . . .", "100,496", "43,039", "55,742", "726", "989"],
139+
["Aggravated assault . . . . . . . .. .. ...... .. ....", "330,368", "209,922", "111,904", "4,613", "3,929"],
140+
["Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .", "1,364,409", "922,139", "406,382", "17,599", "18,289"],
141+
["Burglary . . .. . . . .. . .... .... .... .... ... . . .", "234,551", "155,994", "74,419", "2,021", "2,117"],
142+
["Larceny-theft . . . . . . . .. .. .. .. .... .. ...... .", "1,056,473", "719,983", "306,625", "14,646", "15,219"],
143+
["Motor vehicle theft . . . . . .. ... . ... ..... ... ..", "63,919", "39,077", "23,184", "817", "841"],
144+
["Arson .. . . .. .. .. ... .... .... .... .... . . . . .", "9,466", "7,085", "2,154", "115", "112"],
145+
["Other assaults .. . . . . . ... . ... . ... ..... ... ..", "1,032,502", "672,865", "332,435", "15,127", "12,075"],
146+
["Forgery and counterfeiting .. . . . . . ... ..... .. ..", "67,054", "44,730", "21,251", "345", "728"],
147+
["Fraud ... . . . . .. .. .. .. .. .. .. .. .. .... . . . . . .", "161,233", "108,032", "50,367", "1,315", "1,519"],
148+
["Embezzlement . . . .. . . . ... . ... . .... ... .....", "13,960", "9,208", "4,429", "75", "248"],
149+
["Stolen property; buying, receiving, possessing .. .", "82,714", "51,953", "29,357", "662", "742"],
150+
["Vandalism . . . . . . . .. .. .. .. .. .. .... .. ..... .", "212,173", "157,723", "48,746", "3,352", "2,352"],
151+
["Weapons—carrying, possessing, etc. .. .. ... .. .", "130,503", "74,942", "53,441", "951", "1,169"],
152+
["Prostitution and commercialized vice . ... .. .. ..", "56,560", "31,699", "23,021", "427", "1,413"],
153+
["Sex offenses 1 . . . . . . . .. .. .. .. .... .. ...... .", "60,175", "44,240", "14,347", "715", "873"],
154+
["Drug abuse violations . . . . . . . .. . ..... .. .....", "1,301,629", "845,974", "437,623", "8,588", "9,444"],
155+
["Gambling . . . . .. . . . ... . ... . .. ... . ...... .. .", "8,046", "2,290", "5,518", "27", "211"],
156+
["Offenses against the family and children ... .. .. .", "87,232", "58,068", "26,850", "1,690", "624"],
157+
["Driving under the influence . . . . . . .. ... ...... .", "1,105,401", "954,444", "121,594", "14,903", "14,460"],
158+
["Liquor laws . . . . . . . .. .. .. .. .. . ..... .. .....", "444,087", "373,189", "50,431", "14,876", "5,591"],
159+
["Drunkenness . .. . . . . . ... . ... . ..... . .......", "469,958", "387,542", "71,020", "8,552", "2,844"],
160+
["Disorderly conduct . . .. . . . . .. .. . ..... .. .....", "515,689", "326,563", "176,169", "8,783", "4,174"],
161+
["Vagrancy . . .. .. . . .. ... .... .... .... .... . . .", "26,347", "14,581", "11,031", "543", "192"],
162+
["All other offenses (except traffic) . .. .. .. ..... ..", "2,929,217", "1,937,221", "911,670", "43,880", "36,446"],
163+
["Suspicion . . .. . . . .. .. .. .. .. .. .. ...... .. . . .", "1,513", "677", "828", "1", "7"],
164+
["Curfew and loitering law violations . .. ... .. ....", "89,578", "54,439", "33,207", "872", "1,060"],
165+
["Runaways . . . . . . . .. .. .. .. .. .. .... .. ..... .", "73,616", "48,343", "19,670", "1,653", "3,950"],
166+
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
167+
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,", "", "", "", ""]
168+
]
169+
77170
data_stream_table_areas = [
78171
["", "One Withholding"],
79172
["Payroll Period", "Allowance"],
@@ -248,6 +341,38 @@
248341
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
249342
]
250343

344+
data_lattice_two_tables_1 = [
345+
["State", "n", "Literacy Status", "", "", "", "", ""],
346+
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
347+
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5"],
348+
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2"],
349+
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8"],
350+
["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9"],
351+
["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0"],
352+
["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8"],
353+
["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6"],
354+
["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5"],
355+
["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4"],
356+
["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6"],
357+
["Pooled", "23889", "30.9", "1.9", "12.3", "23.2", "25.2", "6.4"]
358+
]
359+
360+
data_lattice_two_tables_2 = [
361+
["State", "n", "Literacy Status", "", "", "", "", ""],
362+
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
363+
["Kerala", "2400", "8.8", "0.3", "20.1", "17.0", "45.6", "8.2"],
364+
["Tamil Nadu", "2400", "29.9", "1.5", "8.5", "33.1", "22.3", "4.8"],
365+
["Karnataka", "2399", "47.9", "2.5", "10.2", "18.8", "18.4", "2.3"],
366+
["Andhra Pradesh", "2400", "66.4", "0.7", "6.8", "12.9", "11.4", "1.8"],
367+
["Maharashtra", "2400", "41.3", "0.6", "14.1", "20.1", "21.6", "2.2"],
368+
["Gujarat", "2390", "57.6", "0.1", "10.3", "16.5", "12.9", "2.7"],
369+
["Madhya Pradesh", "2402", "58.7", "2.2", "6.6", "24.1", "5.3", "3.0"],
370+
["Orissa", "2405", "50.0", "0.9", "8.1", "21.9", "15.1", "4.0"],
371+
["West Bengal", "2293", "49.1", "4.8", "11.2", "16.8", "17.1", "1.1"],
372+
["Uttar Pradesh", "2400", "67.3", "2.0", "3.1", "17.2", "7.7", "2.7"],
373+
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
374+
]
375+
251376
data_lattice_table_areas = [
252377
["", "", "", "", "", "", "", "", ""],
253378
["State", "n", "Literacy Status", "", "", "", "", "", ""],

tests/test_common.py

+22
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@ def test_stream_table_rotated():
5656
assert df.equals(tables[0].df)
5757

5858

59+
def test_stream_two_tables():
60+
df1 = pd.DataFrame(data_stream_two_tables_1)
61+
df2 = pd.DataFrame(data_stream_two_tables_2)
62+
63+
filename = os.path.join(testdir, "tabula/12s0324.pdf")
64+
tables = camelot.read_pdf(filename, flavor='stream')
65+
assert len(tables) == 2
66+
assert df1.equals(tables[0].df)
67+
assert df2.equals(tables[1].df)
68+
69+
5970
def test_stream_table_areas():
6071
df = pd.DataFrame(data_stream_table_areas)
6172

@@ -111,6 +122,17 @@ def test_lattice_table_rotated():
111122
assert df.equals(tables[0].df)
112123

113124

125+
def test_lattice_two_tables():
126+
df1 = pd.DataFrame(data_lattice_two_tables_1)
127+
df2 = pd.DataFrame(data_lattice_two_tables_2)
128+
129+
filename = os.path.join(testdir, "twotables_2.pdf")
130+
tables = camelot.read_pdf(filename)
131+
assert len(tables) == 2
132+
assert df1.equals(tables[0].df)
133+
assert df2.equals(tables[1].df)
134+
135+
114136
def test_lattice_table_areas():
115137
df = pd.DataFrame(data_lattice_table_areas)
116138

0 commit comments

Comments
 (0)