Skip to content

Commit f6a665b

Browse files
gjeuken, Zethson, and pre-commit-ci[bot]
authored
A memory efficient implementation of the .mtx reading function (#3389)
Co-authored-by: Lukas Heumos <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent d2db761 commit f6a665b

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

src/scanpy/datasets/_ebi_expression_atlas.py

+10 -2
Original file line number | Diff line number | Diff line change
@@ -67,13 +67,21 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix:
67 67       max_int32 = np.iinfo(np.int32).max
68 68       coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32
69 69
70    -     data = pd.read_csv(
   70 +     chunks = pd.read_csv(
71 71           stream,
72 72           sep=r"\s+",
73 73           header=None,
74 74           dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32},
   75 +         chunksize=1e7,
75 76       )
76    -     mtx = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n))
   77 +     data = np.array([], dtype=np.float64)
   78 +     i = np.array([], dtype=int)
   79 +     j = np.array([], dtype=int)
   80 +     for chunk in chunks:
   81 +         data = np.append(data, chunk[2])
   82 +         i = np.append(i, chunk[1] - 1)
   83 +         j = np.append(j, chunk[0] - 1)
   84 +     mtx = sparse.csr_matrix((data, (i, j)), shape=(m, n))
77 85       return mtx
78 86
79 87
0 commit comments

Comments
 (0)