From 3c5eeab21b6c74846be8f49d800716b875b1fe57 Mon Sep 17 00:00:00 2001 From: Kriyszig Date: Sun, 1 Sep 2019 15:20:09 +0530 Subject: [PATCH 1/5] Preliminary implementation of pivot operation * Added basic pivot operation * Added unittests for homogeneous and heterogeneous dataframe todo: * Add limiting conditions --- source/magpie/dataframe.d | 134 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/source/magpie/dataframe.d b/source/magpie/dataframe.d index 5dc9eca..6607a87 100644 --- a/source/magpie/dataframe.d +++ b/source/magpie/dataframe.d @@ -1829,6 +1829,90 @@ public: return dropperRuntimeInternal(pos[0 .. k]); } + + auto pivot(size_t col_size)(int[] index, int[] columns, int[] values) + { + import std.conv: to; + import std.algorithm: countUntil; + + DataFrame!(suitableType!RowType, col_size) ret; + Index inx; + string[][][2] indices; + string[] titles; + + indices[0].length = index.length; + titles.length = index.length; + indices[1].length = columns.length; + + static foreach(k; 0 .. 2) + { + foreach(pos, i; ((k == 0) ? index : columns)) + { + string[] indxdata; + string[] unique; + int end; + + static if(isHomogeneousType) + indxdata = to!(string[])(data[i]); + else + static foreach(j; 0 .. RowType.length) + if(i == j) + indxdata = to!(string[])(data[j]); + + unique.length = rows; + // Getting Unique indexes in order to prevent index collision + foreach(j; indxdata) + { + if(countUntil(unique, j) == -1) + { + unique[end] = j; + ++end; + } + } + + indices[k][pos] = unique[0 .. 
end]; + if(!k) + titles[pos] = "Index" ~ to!string(pos); + } + } + + const size_t level_size = indices[0][$ - 1].length; + inx.constructFromLevels!0(indices[0], titles); + inx.constructFromLevels!1(indices[1]); + ret.setFrameIndex(inx); + ret.rows = ret.indx.indexing[0].codes[0].length; + + foreach(i, ele; values) + { + if(i > col_size / level_size - 1) + break; + + toArr!(ret.RowType[0]) dfval; + static if(isHomogeneousType) + dfval = to!(toArr!(ret.RowType[0]))(data[ele]); + else + static foreach(j; 0 .. RowType.length) + { + if(j == ele) + dfval = to!(toArr!(ret.RowType[0]))(data[j]); + } + + size_t start; + while((start < dfval.length) && (start / level_size < ret.rows)) + { + foreach(j; 0 .. level_size) + { + if(start > dfval.length - 1) + break; + + ret.data[i * level_size + j][start / level_size] = dfval[start]; + ++start; + } + } + } + + return ret; + } } // Testing DataFrame Definition - O(n + log(n)) @@ -3879,3 +3963,53 @@ unittest ~ "Firm5 1.1 0.5 \n" ); } + +// Pivot Operation +unittest +{ + DataFrame!(float, 3) df; + Index inx; + inx[0] = ["0", "1", "2", "3"]; + inx[1] = ["Foo", "Bar", "Baz"]; + df.setFrameIndex(inx); + + df = [[1,3,1],[1,3,2],[2,4,3],[2,4,4]]; + + // Single Index + assert(df.pivot!2([1],[0],[2]).display(true, 200) == "Index0 1 2\n" + ~ "3 1 2\n" + ~ "4 3 4\n" + ); + + // Multi-Index + assert(df.pivot!4([1],[0, 1],[2, 1]).display(true, 200) == " 1 1 2 2\n" + ~ "Index0 3 4 3 4\n" + ~ "3 1 2 3 3\n" + ~ "4 3 4 4 4\n" + ); +} + +// Pivot Operation on heterogeneous DataFrame +unittest +{ + DataFrame!(int, double, 2) df; + Index inx; + inx[0] = ["0", "1", "2", "3"]; + inx[1] = ["Foo", "Bar", "Baz"]; + df.setFrameIndex(inx); + + df = [[1,3,1],[1,3,2],[2,4,3],[2,4,4]]; + + // Single Index + assert(df.pivot!2([1],[0],[2]).display(true, 200) == "Index0 1 2\n" + ~ "3 1 2\n" + ~ "4 3 4\n" + ); + + // Multi-Index + assert(df.pivot!4([1],[0, 1],[2, 1]).display(true, 200) == " 1 1 2 2\n" + ~ "Index0 3 4 3 4\n" + ~ "3 1 2 3 3\n" + ~ "4 3 4 4 4\n" + ); 
+} From 4cd40cb64f27dd2f39386a2db884426c425759b8 Mon Sep 17 00:00:00 2001 From: Kriyszig Date: Fri, 6 Sep 2019 11:01:17 +0530 Subject: [PATCH 2/5] Use foreach instead of while in pivot * Using foreach as the number of iterations can be found beforehand --- source/magpie/dataframe.d | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/source/magpie/dataframe.d b/source/magpie/dataframe.d index 6607a87..2ec3949 100644 --- a/source/magpie/dataframe.d +++ b/source/magpie/dataframe.d @@ -1833,7 +1833,7 @@ public: auto pivot(size_t col_size)(int[] index, int[] columns, int[] values) { import std.conv: to; - import std.algorithm: countUntil; + import std.algorithm: countUntil, max; DataFrame!(suitableType!RowType, col_size) ret; Index inx; @@ -1897,17 +1897,9 @@ public: dfval = to!(toArr!(ret.RowType[0]))(data[j]); } - size_t start; - while((start < dfval.length) && (start / level_size < ret.rows)) + foreach(j; 0 .. max(dfval.length, ret.rows * level_size)) { - foreach(j; 0 .. 
level_size) - { - if(start > dfval.length - 1) - break; - - ret.data[i * level_size + j][start / level_size] = dfval[start]; - ++start; - } + ret.data[(i * level_size) + (j % level_size)][j / level_size] = dfval[j]; } } From b46979eb2216e1196156eb0d81a9ae92fe8fd1af Mon Sep 17 00:00:00 2001 From: Kriyszig Date: Wed, 11 Sep 2019 09:30:44 +0530 Subject: [PATCH 3/5] Removed break by altering loop range * Changed loop range instead of using break --- source/magpie/dataframe.d | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/source/magpie/dataframe.d b/source/magpie/dataframe.d index 2ec3949..3dc53fb 100644 --- a/source/magpie/dataframe.d +++ b/source/magpie/dataframe.d @@ -1833,7 +1833,7 @@ public: auto pivot(size_t col_size)(int[] index, int[] columns, int[] values) { import std.conv: to; - import std.algorithm: countUntil, max; + import std.algorithm: countUntil, max, min; DataFrame!(suitableType!RowType, col_size) ret; Index inx; @@ -1882,18 +1882,15 @@ public: ret.setFrameIndex(inx); ret.rows = ret.indx.indexing[0].codes[0].length; - foreach(i, ele; values) + foreach(i; 0 .. min(values.length, col_size / level_size)) { - if(i > col_size / level_size - 1) - break; - toArr!(ret.RowType[0]) dfval; static if(isHomogeneousType) - dfval = to!(toArr!(ret.RowType[0]))(data[ele]); + dfval = to!(toArr!(ret.RowType[0]))(data[values[i]]); else static foreach(j; 0 .. 
RowType.length) { - if(j == ele) + if(j == values[i]) dfval = to!(toArr!(ret.RowType[0]))(data[j]); } From 7013a439950385084d71565b04f5eab380e68afc Mon Sep 17 00:00:00 2001 From: Kriyszig Date: Wed, 11 Sep 2019 09:51:03 +0530 Subject: [PATCH 4/5] BUGFIX: Take care of clipping conditions for pivot * Pivot takes care of clipping conditions when data size exceeds the frame size --- source/magpie/dataframe.d | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/source/magpie/dataframe.d b/source/magpie/dataframe.d index 3dc53fb..3dfac1f 100644 --- a/source/magpie/dataframe.d +++ b/source/magpie/dataframe.d @@ -1833,7 +1833,7 @@ public: auto pivot(size_t col_size)(int[] index, int[] columns, int[] values) { import std.conv: to; - import std.algorithm: countUntil, max, min; + import std.algorithm: countUntil, min; DataFrame!(suitableType!RowType, col_size) ret; Index inx; @@ -1876,7 +1876,7 @@ public: } } - const size_t level_size = indices[0][$ - 1].length; + const size_t level_size = indices[1][$ - 1].length; inx.constructFromLevels!0(indices[0], titles); inx.constructFromLevels!1(indices[1]); ret.setFrameIndex(inx); @@ -1894,7 +1894,7 @@ public: dfval = to!(toArr!(ret.RowType[0]))(data[j]); } - foreach(j; 0 .. 
min(dfval.length, ret.rows * level_size)) { ret.data[(i * level_size) + (j % level_size)][j / level_size] = dfval[j]; } @@ -4002,3 +4002,19 @@ unittest ~ "4 3 4 4 4\n" ); } + +// Pivot Operation on heterogeneous DataFrame +unittest +{ + DataFrame!(int, double, 2) df; + Index inx; + inx[0] = ["0", "1", "2", "3"]; + inx[1] = ["Foo", "Bar", "Baz"]; + df.setFrameIndex(inx); + + df = [[1,3,1],[1,3,2],[1,4,3],[1,4,4]]; + assert(df.pivot!1([1], [0], [2]).display(true, 200) == "Index0 1\n" + ~ "3 1\n" + ~ "4 2\n" + ); +} From 5eafaf5170485d266952303f5fb59bfd4f4fd903 Mon Sep 17 00:00:00 2001 From: Kriyszig Date: Fri, 13 Sep 2019 08:45:53 +0530 Subject: [PATCH 5/5] Added conditions for pivot * Added necessary conditions to allow pivot operation --- source/magpie/dataframe.d | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/magpie/dataframe.d b/source/magpie/dataframe.d index 3dfac1f..6027b60 100644 --- a/source/magpie/dataframe.d +++ b/source/magpie/dataframe.d @@ -1835,6 +1835,11 @@ public: import std.conv: to; import std.algorithm: countUntil, min; + + static assert(col_size > 0, "Cannot construct a DataFrame with no columns"); + assert(index.length && columns.length, "DataFrame cannot be pivoted without specifying index values"); + assert(values.length > 0, "Cannot pivot a DataFrame with no values"); + DataFrame!(suitableType!RowType, col_size) ret; Index inx; string[][][2] indices;