Skip to content

Commit 2fce667

Browse files
authored
Completed major features
* Switch application structure to an umbrella app * Add dashboard web app to provide a browser interface to the application * Add PostgreSQL support * Minor improvements in the overall codebase * Removed ability to provide configurations using environment variables * Improve readme and add license
2 parents 4a66dac + f04272d commit 2fce667

File tree

101 files changed

+13077
-1108
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+13077
-1108
lines changed

.github/images/cmd.gif

25.2 MB
Loading

.github/images/csv2sql.png

14.9 KB
Loading

.github/images/dashboard.gif

30 MB
Loading

.gitignore

-12
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,8 @@ erl_crash.dump
1919
# Also ignore archive artifacts (built via "mix archive.build").
2020
*.ez
2121

22-
# Ignore package tarball (built via "mix hex.build").
23-
csv2sql-*.tar
24-
2522
# linter
2623
/.elixir_ls/
2724

28-
# schema file
29-
schema.sql
30-
31-
# config file
32-
/config.env
33-
3425
# Formatting file
3526
.formatter.exs
36-
37-
# escript binary
38-
csv2sql

LICENSE.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) [year] [fullname]
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

+169-191
Large diffs are not rendered by default.

apps/csv2sql/.gitignore

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# The directory Mix will write compiled artifacts to.
2+
/_build/
3+
4+
# If you run "mix test --cover", coverage assets end up here.
5+
/cover/
6+
7+
# The directory Mix downloads your dependencies sources to.
8+
/deps/
9+
10+
# Where third-party dependencies like ExDoc output generated docs.
11+
/doc/
12+
13+
# Ignore .fetch files in case you like to edit your project deps locally.
14+
/.fetch
15+
16+
# If the VM crashes, it generates a dump, let's ignore it too.
17+
erl_crash.dump
18+
19+
# Also ignore archive artifacts (built via "mix archive.build").
20+
*.ez
21+
22+
# Ignore package tarball (built via "mix hex.build").
23+
csv2sql-*.tar
24+
25+
# linter
26+
/.elixir_ls/
27+
28+
# schema file
29+
schema.sql
30+
31+
# config file
32+
/config.env
33+
34+
# Formatting file
35+
.formatter.exs
36+
37+
# escript binary
38+
csv2sql

apps/csv2sql/README.md

+288
Large diffs are not rendered by default.
File renamed without changes.

apps/csv2sql/lib/csv2sql.ex

+189
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
defmodule Csv2sql do
  @moduledoc """
  Escript entry point for csv2sql.

  Parses command-line options, loads them into the application environment,
  starts the supervision tree and (outside dashboard mode) blocks so the
  escript keeps running until the transfer finishes or fails.
  """

  @doc """
  Escript main entry point.

  `args` is the raw argv list. Returns the pid of the started supervisor
  (used by the dashboard app, which calls this function directly).
  """
  def main(args) do
    Csv2sql.Helpers.greet()

    # Load configuration variables dynamically for escripts. This is required
    # since configuration variables are otherwise frozen to whatever they were
    # when the escript was built and cannot be changed later.
    dashboard = update_config(args)

    # Start supervision tree
    {:ok, sup_pid} = Csv2sql.Application.start(:no_args, :no_args)

    # Wait for finish and stop the supervision tree.
    # This is done in a separate Task so we can reply back to the caller
    # (dashboard GUI) immediately after the supervision tree is started.
    Task.start(fn -> wait_for_finish(sup_pid) end)

    # If the error tracker server is not running, start it.
    # This branch executes the first time the app is started from the
    # "dashboard" app.
    if !Process.whereis(:error_tracker), do: Csv2sql.ErrorTracker.start_link(:no_args)

    # Register the main supervisor pid with the error tracker.
    # The error tracker will stop the supervisor in case of errors.
    Csv2sql.ErrorTracker.register_supervisor(sup_pid)

    unless dashboard do
      # In escripts, as soon as main() returns the escript ends; blocking in
      # receive keeps the escript alive when used without the dashboard.
      receive do
        {:wait} ->
          System.halt(0)
      end
    end

    sup_pid
  end

  # Polls the Observer stage until the run either errors out (the error
  # tracker handles shutdown) or finishes, in which case the supervision
  # tree is stopped here.
  defp wait_for_finish(sup_pid) do
    Csv2sql.Observer.get_stage()
    |> case do
      :error ->
        nil

      :finish ->
        # Finish and stop supervisors after a second
        :timer.sleep(1000)
        Supervisor.stop(sup_pid)

      _ ->
        wait_for_finish(sup_pid)
    end
  end

  # Parses CLI options, applies defaults, and writes the resulting
  # configuration into the application environment.
  # Returns the value of the --dashboard flag (nil/false when absent).
  defp update_config(args) do
    {opts, _, _} =
      OptionParser.parse(args,
        strict: [
          dashboard: :boolean,
          schema_file_path: :string,
          source_csv_directory: :string,
          imported_csv_directory: :string,
          validated_csv_directory: :string,
          skip_make_schema: :boolean,
          skip_insert_schema: :boolean,
          skip_insert_data: :boolean,
          skip_validate_import: :boolean,
          db_connection_string: :string,
          connection_socket: :string,
          varchar_limit: :integer,
          schema_infer_chunk_size: :integer,
          worker_count: :integer,
          db_worker_count: :integer,
          insertion_chunk_size: :integer,
          job_count_limit: :integer,
          log: :string,
          timeout: :integer,
          connect_timeout: :integer,
          pool_size: :integer,
          queue_target: :integer,
          queue_interval: :integer
        ]
      )

    source_csv_directory = opts[:source_csv_directory] || "."
    schema_file_path = opts[:schema_file_path] || source_csv_directory
    imported_csv_directory = opts[:imported_csv_directory] || "#{source_csv_directory}/imported"

    validated_csv_directory =
      opts[:validated_csv_directory] || "#{source_csv_directory}/validated"

    make_schema = if opts[:skip_make_schema], do: false, else: true
    insert_schema = if opts[:skip_insert_schema], do: false, else: true
    insert_data = if opts[:skip_insert_data], do: false, else: true
    validate_import = if opts[:skip_validate_import], do: false, else: true

    [db_type, username, password, host, database_name] =
      case opts[:db_connection_string] do
        nil ->
          # No connection string supplied: leave the credentials unset instead
          # of crashing with a MatchError (the previous behavior, since the
          # bare `if` returned nil here).
          [nil, nil, nil, nil, nil]

        str ->
          # Expected format: "db_type:username:password@host/database".
          # The `parts:` limits keep passwords containing ':' intact and
          # split only at the first '@' and '/'.
          [db_type, username, rest] = String.split(str, ":", parts: 3)
          [password, rest] = String.split(rest, "@", parts: 2)
          [host, database_name] = String.split(rest, "/", parts: 2)
          [db_type, username, password, host, database_name]
      end

    connection_socket = opts[:connection_socket] || "/var/run/mysqld/mysqld.sock"

    varchar_limit = opts[:varchar_limit] || 100
    schema_infer_chunk_size = opts[:schema_infer_chunk_size] || 100
    worker_count = opts[:worker_count] || 10
    db_worker_count = opts[:db_worker_count] || 15
    insertion_chunk_size = opts[:insertion_chunk_size] || 100
    job_count_limit = opts[:job_count_limit] || 10

    # NOTE(review): String.to_atom/1 on CLI input creates atoms dynamically;
    # the expected values are a small closed set of Ecto log levels, so this
    # is bounded in practice, but String.to_existing_atom/1 would be safer.
    log = if opts[:log], do: String.to_atom(opts[:log]), else: false

    timeout = opts[:timeout] || 60_000
    connect_timeout = opts[:connect_timeout] || 60_000
    pool_size = opts[:pool_size] || 20
    queue_target = opts[:queue_target] || 5000
    queue_interval = opts[:queue_interval] || 1000

    repo_config = [
      username: username,
      password: password,
      host: host,
      insertion_chunk_size: insertion_chunk_size,
      job_count_limit: job_count_limit,
      log: log,
      timeout: timeout,
      connect_timeout: connect_timeout,
      pool_size: pool_size,
      queue_target: queue_target,
      queue_interval: queue_interval
    ]

    repo_config =
      if db_type == "postgres" do
        {Csv2sql.PostgreSQLRepo, repo_config ++ [database: database_name]}
      else
        {Csv2sql.MySQLRepo,
         repo_config ++
           [
             database_name: database_name,
             socket: connection_socket
           ]}
      end

    current_config = [
      csv2sql: [
        {Csv2sql.SchemaMaker,
         [
           varchar_limit: varchar_limit,
           schema_file_path: schema_file_path,
           schema_infer_chunk_size: schema_infer_chunk_size
         ]},
        {Csv2sql.MainServer,
         [
           worker_count: worker_count,
           db_worker_count: db_worker_count,
           source_csv_directory: source_csv_directory,
           imported_csv_directory: imported_csv_directory,
           validated_csv_directory: validated_csv_directory,
           set_validate: validate_import,
           db_type: db_type
         ]},
        {Csv2sql.Worker,
         [
           set_make_schema: make_schema,
           set_insert_schema: insert_schema,
           set_insert_data: insert_data
         ]},
        repo_config
      ]
    ]

    Application.put_all_env(current_config)

    opts[:dashboard]
  end

  @doc """
  Returns the configured Ecto repo module based on the `:db_type` setting.
  """
  def get_repo() do
    db_type = Application.get_env(:csv2sql, Csv2sql.MainServer)[:db_type]

    if db_type == "postgres", do: Csv2sql.PostgreSQLRepo, else: Csv2sql.MySQLRepo
  end

  @doc """
  Returns the configured database type as an atom: `:postgres` or `:mysql`.
  """
  def get_db_type() do
    if Application.get_env(:csv2sql, Csv2sql.MainServer)[:db_type] == "postgres",
      do: :postgres,
      else: :mysql
  end
end
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
defmodule Csv2sql.Application do
  @moduledoc """
  Root supervisor for csv2sql.

  Starts the database repo only when a stage that needs the database
  (validation, schema insertion or data insertion) is enabled.
  """
  use Application

  @impl true
  def start(_type, _args) do
    children =
      repo_children() ++
        [
          Csv2sql.Observer,
          Csv2sql.JobQueueServer,
          Csv2sql.DbWorkerSupervisor,
          Csv2sql.WorkerSupervisor,
          Csv2sql.MainServer
        ]

    Supervisor.start_link(children, strategy: :one_for_one, name: Csv2sql.Supervisor)
  end

  # The repo is supervised only if at least one database-touching stage is
  # configured; otherwise no repo process is started at all.
  defp repo_children() do
    db_needed? =
      Enum.any?([
        Application.get_env(:csv2sql, Csv2sql.MainServer)[:set_validate],
        Application.get_env(:csv2sql, Csv2sql.Worker)[:set_insert_schema],
        Application.get_env(:csv2sql, Csv2sql.Worker)[:set_insert_data]
      ])

    if db_needed?, do: [Csv2sql.get_repo()], else: []
  end
end
+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
defmodule Csv2sql.DataTransfer do
  @moduledoc """
  Splits CSV files into chunks and feeds them to the DB worker job queue.
  """

  alias NimbleCSV.RFC4180, as: CSV
  alias Csv2sql.{JobQueueServer, Helpers}

  # Delay (ms) used both between job-queue polls and to give the last
  # in-flight chunk time to finish inserting before the file is moved.
  @poll_interval 300

  @doc """
  Divides a csv file in chunks and place them in a job queue.
  Whenever a DB worker is free it will pick up a chunk from the queue
  and insert it in the database.

  Blocks until every chunk of `file` has been processed, then moves the
  file to the configured imported directory.
  """
  def process_file(file) do
    # Fixed typo in the user-facing message: "tranfer" -> "transfer".
    Helpers.print_msg("Begin data transfer for file: " <> Path.basename(file))

    insertion_chunk_size = Application.get_env(:csv2sql, Csv2sql.get_repo())[:insertion_chunk_size]

    file
    |> File.stream!()
    |> CSV.parse_stream()
    |> Stream.chunk_every(insertion_chunk_size)
    |> Enum.each(fn data_chunk ->
      check_job_queue(file, data_chunk)
    end)

    wait_for_file_transfer(file)
  end

  # Wait until all chunks for the current file in the job queue have been
  # processed. The final `:timer.sleep/1` waits for the last chunk to get
  # inserted: if no chunks are present in the job queue, a DB worker may
  # still be inserting the chunk it already picked up, so we wait before
  # moving the file.
  defp wait_for_file_transfer(file) do
    if JobQueueServer.job_for_file_present(file) do
      # Sleep between polls so this recursion does not busy-spin at 100% CPU
      # (the original looped with no delay).
      :timer.sleep(@poll_interval)
      wait_for_file_transfer(file)
    else
      imported_csv_directory =
        Application.get_env(:csv2sql, Csv2sql.MainServer)[:imported_csv_directory]

      :timer.sleep(@poll_interval)
      File.rename(file, "#{imported_csv_directory}/#{Path.basename(file)}")
      Helpers.print_msg("Finished processing file: " <> Path.basename(file), :green)
    end
  end

  # Wait until the job queue has space for the next chunk, polling with a
  # short sleep (the original recursed with no delay, pegging a scheduler).
  defp check_job_queue(file, data_chunk) do
    job_count_limit = Application.get_env(:csv2sql, Csv2sql.get_repo())[:job_count_limit]
    job_count = JobQueueServer.get_job_count()

    if job_count > job_count_limit do
      :timer.sleep(10)
      check_job_queue(file, data_chunk)
    else
      JobQueueServer.add_data_chunk(file, data_chunk)
    end
  end
end

0 commit comments

Comments
 (0)