diff --git a/README.md b/README.md index 8c3ca625..a4dda6f0 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ DataFrame is a tabular data structure for data analysis in [Pharo](https://pharo To install the latest stable version of DataFrame (`pre-v3`), go to the Playground (`Ctrl+OW`) in your Pharo image and execute the following Metacello script (select it and press Do-it button or `Ctrl+D`): ```st -EpMonitor disableDuring: [ +EpMonitor disableDuring: [ Metacello new baseline: 'DataFrame'; repository: 'github://PolyMathOrg/DataFrame:pre-v3/src'; @@ -21,13 +21,23 @@ EpMonitor disableDuring: [ Use this script if you want the latest version of DataFrame: ```st -EpMonitor disableDuring: [ +EpMonitor disableDuring: [ Metacello new baseline: 'DataFrame'; repository: 'github://PolyMathOrg/DataFrame/src'; load ]. ``` +If you'd be interested in SQLite support, use `load: 'sqlite'` at the end: + +```st +EpMonitor disableDuring: [ + Metacello new + baseline: 'DataFrame'; + repository: 'github://PolyMathOrg/DataFrame/src'; + load: 'sqlite' ]. +``` + _Note:_ `EpMonitor` serves to deactive [Epicea](https://github.com/pharo-open-documentation/pharo-wiki/blob/3cfb4ebc19821d607bec35c34ee928b4e06822ee/General/TweakingBigImages.md#disable-epicea), a Pharo code recovering mechanism, during the installation of DataFrame. ## How to depend on it? @@ -52,7 +62,7 @@ A data frame is like a database inside a variable. It is an object which can be In this section I show a very simple example of creating and manipulating a little data frame. For more advanced examples, please check the [DataFrame Booklet](#dataframe-booklet). -### Creating a data frame +### Creating a data frame ```Smalltalk weather := DataFrame withRows: #( @@ -120,6 +130,34 @@ weather transposed. 
| **2** | true | true | false | true | true | | **3** | snow | rain | - | rain | snow | +### SQLite examples +*The following examples expect a valid, connected SQLite connection in a variable `conn`* +#### Load data from SQLite query: +```st +df := DataFrame readFromSqliteCursor: (conn execute: 'SELECT * FROM table'). +``` +#### Write data to SQLite table (DataFrame column names <=> table column names): +```st +df writeToSqlite: conn tableName: 'table'. +``` +#### Write to differently named columns (provide names for ALL DataFrame columns!) +```st +df writeToSqlite: conn tableName: 'table' columnNames: #('col1' 'col2' 'col3'). +``` +#### Mapping (selecting / renaming dataframe columns): +Let's assume: +- CREATE TABLE tbl (a,b,c) +- DataFrame with columns (a,x,c,d) +- We want to write: + - a to a + - x to b + - c to c + - ignore d +- NB: no mention of column d, order is irrelevant +```st +df writeToSqlite: conn tableName: 'table' columnMappings: { #c. #x -> #b. #a }. +``` + ## Documentation and Literature 1. [Data Analysis Made Simple with Pharo DataFrame](https://github.com/SquareBracketAssociates/Booklet-DataFrame) - a booklet that serves as the main source of documentation for the DataFrame project. It describes the complete API of DataFrame and DataSeries data structures, and provides examples for each method. 
diff --git a/src/BaselineOfDataFrame/BaselineOfDataFrame.class.st b/src/BaselineOfDataFrame/BaselineOfDataFrame.class.st index 3e671122..cc331e7b 100644 --- a/src/BaselineOfDataFrame/BaselineOfDataFrame.class.st +++ b/src/BaselineOfDataFrame/BaselineOfDataFrame.class.st @@ -1,10 +1,11 @@ Class { - #name : #BaselineOfDataFrame, - #superclass : #BaselineOf, - #category : #BaselineOfDataFrame + #name : 'BaselineOfDataFrame', + #superclass : 'BaselineOf', + #category : 'BaselineOfDataFrame', + #package : 'BaselineOfDataFrame' } -{ #category : #baselines } +{ #category : 'baselines' } BaselineOfDataFrame >> baseline: spec [ spec for: #common do: [ @@ -18,13 +19,20 @@ BaselineOfDataFrame >> baseline: spec [ spec baseline: 'AINormalization' with: [ spec repository: 'github://pharo-ai/normalization/src' ]. + spec + baseline: 'SQLite3' + with: [ spec repository: 'github://pharo-rdbms/Pharo-SQLite3/src' ]. "Packages" spec package: 'DataFrame' with: [ spec requires: #('AINormalization') ]; package: 'DataFrame-Tests' with: [ spec requires: #('DataFrame') ]; package: 'DataFrame-IO' with: [ spec requires: #('DataFrame' 'NeoCSV' 'NeoJSON') ]; + package: 'DataFrame-IO-Sqlite' with: [ spec requires: #('DataFrame' 'SQLite3') ]; package: 'DataFrame-IO-Tests' with: [ spec requires: #('DataFrame-IO') ] ]. + + spec group: 'default' with: #('DataFrame-IO-Tests'). + spec group: 'sqlite' with: #('DataFrame-IO-Sqlite' 'DataFrame-IO-Tests'). 
spec for: #'pharo7.x' @@ -37,5 +45,5 @@ BaselineOfDataFrame >> baseline: spec [ do: [ spec package: 'DataFrame-Pharo6'; - package: 'DataFrame-Pharo67' ] + package: 'DataFrame-Pharo67' ] ] diff --git a/src/BaselineOfDataFrame/package.st b/src/BaselineOfDataFrame/package.st index 5ebc01ee..985805ac 100644 --- a/src/BaselineOfDataFrame/package.st +++ b/src/BaselineOfDataFrame/package.st @@ -1 +1 @@ -Package { #name : #BaselineOfDataFrame } +Package { #name : 'BaselineOfDataFrame' } diff --git a/src/DataFrame-IO-Sqlite/DataFrame.extension.st b/src/DataFrame-IO-Sqlite/DataFrame.extension.st new file mode 100644 index 00000000..de8dacef --- /dev/null +++ b/src/DataFrame-IO-Sqlite/DataFrame.extension.st @@ -0,0 +1,35 @@ +Extension { #name : 'DataFrame' } + +{ #category : '*DataFrame-IO-Sqlite' } +DataFrame class >> readFromSqliteCursor: aSqliteCursor [ + "Convenience shortcut for SQLite3Cursor => DataFrame" + ^ self readFrom: aSqliteCursor using: DataFrameSqliteReader new +] + +{ #category : '*DataFrame-IO-Sqlite' } +DataFrame >> writeToSqlite: aSqlite3Connection tableName: aString [ + + | writer | + writer := DataFrameSqliteWriter writeToTable: aString. + self writeTo: aSqlite3Connection using: writer +] + +{ #category : '*DataFrame-IO-Sqlite' } +DataFrame >> writeToSqlite: aSqlite3Connection tableName: aString columnMappings: aCollection [ + + | writer | + writer := DataFrameSqliteWriter + writeToTable: aString + columnMappings: aCollection. + self writeTo: aSqlite3Connection using: writer +] + +{ #category : '*DataFrame-IO-Sqlite' } +DataFrame >> writeToSqlite: aSqlite3Connection tableName: aString columnNames: aCollection [ + + | writer | + writer := DataFrameSqliteWriter + writeToTable: aString + columnMappings: aCollection. 
+ self writeTo: aSqlite3Connection using: writer +] diff --git a/src/DataFrame-IO-Sqlite/DataFrameSqliteReader.class.st b/src/DataFrame-IO-Sqlite/DataFrameSqliteReader.class.st new file mode 100644 index 00000000..b508cbd6 --- /dev/null +++ b/src/DataFrame-IO-Sqlite/DataFrameSqliteReader.class.st @@ -0,0 +1,15 @@ +Class { + #name : 'DataFrameSqliteReader', + #superclass : 'DataFrameReader', + #category : 'DataFrame-IO-Sqlite', + #package : 'DataFrame-IO-Sqlite' +} + +{ #category : 'reading' } +DataFrameSqliteReader >> readFrom: aSqliteCursor [ + "Read all rows from cursor, stuff them into a new dataframe with columns of same name" + + | cols | + cols := aSqliteCursor columnNames. "need to grab columns before exhausting the cursor" + ^ DataFrame withRows: aSqliteCursor rows columnNames: cols +] diff --git a/src/DataFrame-IO-Sqlite/DataFrameSqliteWriter.class.st b/src/DataFrame-IO-Sqlite/DataFrameSqliteWriter.class.st new file mode 100644 index 00000000..ca167f92 --- /dev/null +++ b/src/DataFrame-IO-Sqlite/DataFrameSqliteWriter.class.st @@ -0,0 +1,105 @@ +Class { + #name : 'DataFrameSqliteWriter', + #superclass : 'DataFrameWriter', + #instVars : [ + 'tableName', + 'columnMappings' + ], + #category : 'DataFrame-IO-Sqlite', + #package : 'DataFrame-IO-Sqlite' +} + +{ #category : 'writing' } +DataFrameSqliteWriter class >> writeToTable: aString [ + + ^ self new + tableName: aString; + yourself +] + +{ #category : 'writing' } +DataFrameSqliteWriter class >> writeToTable: aString columnMappings: aCollection [ + + ^ self new + tableName: aString; + columnMappings: aCollection; + yourself +] + +{ #category : 'accessing' } +DataFrameSqliteWriter >> columnMappings [ + + ^ columnMappings +] + +{ #category : 'accessing' } +DataFrameSqliteWriter >> columnMappings: anObject [ + + columnMappings := anObject +] + +{ #category : 'helpers' } +DataFrameSqliteWriter >> fieldIndicesFor: aDataFrame [ + "gather indices of columns in dataframe (to avoid lookup by field name later, in loop)" 
+ + ^ (self getColumnMappings: aDataFrame) collect: [ :m | + | sourceName | + sourceName := m isAssociation + ifTrue: [ m key ] + ifFalse: [ m ]. + aDataFrame columnNames indexOf: sourceName ] +] + +{ #category : 'helpers' } +DataFrameSqliteWriter >> getColumnMappings: aDataFrame [ + + ^ columnMappings ifNil: [ aDataFrame columnNames ] +] + +{ #category : 'helpers' } +DataFrameSqliteWriter >> getColumnNames: aDataFrame [ + + ^ (self getColumnMappings: aDataFrame) collect: [ :m | m value ] +] + +{ #category : 'helpers' } +DataFrameSqliteWriter >> insertQueryForColumns: aSequence [ + "" + ^ String streamContents: [ :strm | + strm + nextPutAll: 'INSERT INTO '; + nextPutAll: tableName; + nextPut: $(; + nextPutAll: (',' join: aSequence); + nextPutAll: ')VALUES('. + aSequence do: [ :ignore | strm nextPut: $? ] separatedBy: [ strm nextPut: $, ]. + strm nextPut: $) ] +] + +{ #category : 'accessing' } +DataFrameSqliteWriter >> tableName [ + + ^ tableName +] + +{ #category : 'accessing' } +DataFrameSqliteWriter >> tableName: anObject [ + + tableName := anObject +] + +{ #category : 'writing' } +DataFrameSqliteWriter >> write: aDataFrame to: aSqliteConnection [ + + | fieldIndices args stmt | + fieldIndices := self fieldIndicesFor: aDataFrame. + args := Array new: fieldIndices size. + stmt := aSqliteConnection prepare: + (self insertQueryForColumns: + (self getColumnNames: aDataFrame)). + + 1 to: aDataFrame dimensions x do: [ :rowIndex | + fieldIndices withIndexDo: [ :srcCol :dstCol | + args at: dstCol put: (aDataFrame contents at: rowIndex at: srcCol) ]. 
+ stmt execute: args ] +] diff --git a/src/DataFrame-IO-Sqlite/package.st b/src/DataFrame-IO-Sqlite/package.st new file mode 100644 index 00000000..9673a3c0 --- /dev/null +++ b/src/DataFrame-IO-Sqlite/package.st @@ -0,0 +1 @@ +Package { #name : 'DataFrame-IO-Sqlite' } diff --git a/src/DataFrame/DataFrame.class.st b/src/DataFrame/DataFrame.class.st index 15f7176c..649d77d1 100644 --- a/src/DataFrame/DataFrame.class.st +++ b/src/DataFrame/DataFrame.class.st @@ -995,22 +995,6 @@ DataFrame >> crossTabulate: colName1 with: colName2 [ ^ col1 crossTabulateWith: col2 ] -{ #category : 'copying' } -DataFrame >> dataPreProcessingEncodeWith: anEncoder [ - "This method is here to speed up pharo-ai/data-preprocessing algos without coupling both projects." - - | copy cache | - copy := self copy. - cache := IdentityDictionary new. - self columns doWithIndex: [ :dataSerie :columnIndex | - | category | - category := cache at: columnIndex ifAbsentPut: [ ((anEncoder categories at: columnIndex) collectWithIndex: [ :elem :index | elem -> index ]) asDictionary ]. - dataSerie doWithIndex: [ :element :rowIndex | - copy at: rowIndex at: columnIndex put: (category at: element ifAbsent: [ AIMissingCategory signalFor: element ]) ] ]. - - ^ copy -] - { #category : 'data-types' } DataFrame >> dataTypeOfColumn: aColumnName [ "Given a column name of the DataFrame, it returns the data type of that column"