-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Wayne Song
committed
Dec 30, 2014
0 parents
commit 8218455
Showing
88 changed files
with
6,258 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[flake8] | ||
exclude: .git,__pycache__,*.pyc,venv,distribution,memsql_loader/vendor | ||
ignore: E121,E128,E201,E202,E221,E222,E241,E302,E4,E5,W292 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
*.py[cod] | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Packages | ||
*.egg | ||
*.egg-info | ||
dist | ||
build | ||
eggs | ||
parts | ||
var | ||
sdist | ||
develop-eggs | ||
.installed.cfg | ||
lib | ||
lib64 | ||
__pycache__ | ||
|
||
# Installer logs | ||
pip-log.txt | ||
|
||
# Vim | ||
*.swp | ||
|
||
venv | ||
memsql_loader.log | ||
|
||
memsql_loader.db* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/bin/bash | ||
|
||
# Pre-commit hook: lint with flake8, then let git check the staged changes | ||
# for whitespace errors. Everything the hook prints goes to stderr. | ||
exec 1>&2 | ||
|
||
# Abort the commit as soon as the lint target reports a failure. | ||
if ! make flake8; then | ||
    echo "" | ||
    echo "Flake8 detected syntax problems." | ||
    echo "Commit aborted." | ||
    exit 1 | ||
fi | ||
|
||
# Hand control over to git's whitespace check; it prints the offending file | ||
# names and its exit status becomes the hook's result. | ||
exec git diff-index --check --cached HEAD -- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
2014-12-05 Version 2.0.0 | ||
|
||
* Initial open source release |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2014 MemSQL (http://www.memsql.com) | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
include *.txt | ||
include *.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
############################## | ||
# ENV | ||
# | ||
SHELL := /bin/bash | ||
|
||
# pre-check: fail fast when a host prerequisite is missing. | ||
# - virtualenv and curl-config are looked up on PATH; curl-config stands in | ||
#   for libcurl (see the error text). Only the command name is probed: the | ||
#   former trailing "--version" was treated by `command -v` as a second name. | ||
# - libncurses is probed by linking an empty C program against -lncurses. | ||
# NOTE(review): $$DEPS_TEXT is expanded by the shell and is not defined in | ||
# this Makefile; presumably it comes from the environment - confirm. | ||
.PHONY: pre-check | ||
pre-check: | ||
	@command -v virtualenv >/dev/null 2>&1 || { echo >&2 "$$DEPS_TEXT Missing package: virtualenv"; exit 1; } | ||
	@command -v curl-config >/dev/null 2>&1 || { echo >&2 "$$DEPS_TEXT Missing package: libcurl"; exit 1; } | ||
	@echo "int main(){}" | gcc -o /dev/null -x c - -lncurses 2>/dev/null || { echo >&2 "$$DEPS_TEXT Missing package: libncurses"; exit 1; } | ||
|
||
# deps: full developer setup - prerequisite check, virtualenv, git hook - and | ||
# then runs scripts/apsw_install.sh inside the activated virtualenv. | ||
.PHONY: deps | ||
deps: pre-check venv/bin/activate .git/hooks/pre-commit | ||
@source venv/bin/activate && ./scripts/apsw_install.sh | ||
|
||
||
# venv: convenience alias for the virtualenv's activate script. | ||
.PHONY: venv | ||
venv: venv/bin/activate | ||
# Re-run whenever requirements.txt is newer than venv/bin/activate: create the | ||
# virtualenv if it is missing, install readline and the requirements, then | ||
# touch the activate script so its timestamp marks the install as up to date. | ||
venv/bin/activate: requirements.txt | ||
test -d venv || virtualenv venv | ||
. venv/bin/activate; easy_install readline | ||
. venv/bin/activate; pip install -r requirements.txt | ||
touch venv/bin/activate | ||
|
||
||
# Install the repository's .pre-commit script as the executable git hook; | ||
# refreshed whenever .pre-commit changes. | ||
.git/hooks/pre-commit: .pre-commit | ||
@cp .pre-commit .git/hooks/pre-commit | ||
@chmod +x .git/hooks/pre-commit | ||
|
||
############################## | ||
# HELPERS | ||
# | ||
|
||
# todo: search the tree for "TODO" markers with `ag`, skipping this Makefile. | ||
.PHONY: todo | ||
todo: | ||
@ag "TODO" --ignore Makefile | ||
|
||
||
# flake8: lint the whole tree inside the virtualenv using the .flake8 config. | ||
.PHONY: flake8 | ||
flake8: | ||
source venv/bin/activate; flake8 --config=.flake8 . | ||
|
||
||
# console: open an interactive IPython session after running scripts/ipython.py. | ||
.PHONY: console | ||
console: | ||
source venv/bin/activate; ipython -i scripts/ipython.py | ||
|
||
||
# cloc: run cloc over bin, memsql_loader and setup.py, reported per file and language. | ||
.PHONY: cloc | ||
cloc: | ||
cloc --by-file-by-lang bin memsql_loader setup.py | ||
|
||
############################## | ||
# BUILD | ||
# | ||
# Package version, read once when the Makefile is parsed (":=") by importing | ||
# memsql_loader with whichever `python` is on PATH. The print-statement form | ||
# means that interpreter must be Python 2. Exported so recipes and sub-makes | ||
# see it in their environment. | ||
MEMSQL_LOADER_VERSION := $(shell python -c "import memsql_loader; print memsql_loader.__version__") | ||
export MEMSQL_LOADER_VERSION | ||
|
||
||
# version: print the detected package version. | ||
.PHONY: version | ||
version: | ||
@echo $(MEMSQL_LOADER_VERSION) | ||
|
||
# clean: remove build output, packaging metadata and Python bytecode caches. | ||
# - The sub-make is invoked through $(MAKE) so command-line flags and the | ||
#   jobserver propagate; its failure is ignored (leading "-"). | ||
# - Bytecode files and __pycache__ directories are removed by find itself, so | ||
#   paths containing whitespace are handled correctly. -prune stops find from | ||
#   descending into a directory that is about to be deleted. | ||
.PHONY: clean | ||
clean: | ||
	-$(MAKE) -C distribution clean | ||
	rm -f logdict2.7.4.final* | ||
	rm -rf *.egg memsql_loader.egg-info dist build | ||
	python setup.py clean --all | ||
	find . -type f -name "*.pyc" -exec rm -f {} + | ||
	find . -name "__pycache__" -prune -exec rm -rf {} + | ||
|
||
# Rebuild distribution/dist/memsql-loader whenever the spec file changes. The | ||
# sub-make is invoked through $(MAKE) so flags such as -j and -n carry over. | ||
distribution/dist/memsql-loader: distribution/memsql_loader.spec | ||
	$(MAKE) -C distribution build | ||
|
||
||
# build: depends on a clean tree and on the binary above. | ||
.PHONY: build | ||
build: clean distribution/dist/memsql-loader | ||
|
||
# release: publish the built binary as a draft GitHub release. | ||
# Steps, in order: | ||
#   1. Force-create the git tag named after the current version and push tags. | ||
#   2. Show the repository's release info and delete any release already using | ||
#      that tag; both steps may fail without stopping the recipe ("-" prefix). | ||
#   3. Create a draft release whose description is the output of | ||
#      scripts/latest_changes.py ("$$" defers the substitution to the shell). | ||
#   4. Upload distribution/dist/memsql-loader as the asset "memsql-loader". | ||
.PHONY: release | ||
release: distribution/dist/memsql-loader | ||
git tag -f "$(MEMSQL_LOADER_VERSION)" && git push --tags | ||
@sleep 1 | ||
-github-release info -u memsql -r memsql-loader | ||
-github-release delete -u memsql -r memsql-loader \ | ||
--tag "$(MEMSQL_LOADER_VERSION)" | ||
github-release release -u memsql -r memsql-loader \ | ||
--tag "$(MEMSQL_LOADER_VERSION)" \ | ||
--name "MemSQL Loader $(MEMSQL_LOADER_VERSION)" \ | ||
--description "$$(./scripts/latest_changes.py)" \ | ||
--draft | ||
github-release upload -u memsql -r memsql-loader \ | ||
--tag "$(MEMSQL_LOADER_VERSION)" \ | ||
--name "memsql-loader" \ | ||
--file "distribution/dist/memsql-loader" | ||
@echo "The release has been uploaded as a draft. View/Edit/Delete it here:" | ||
@echo "https://github.com/memsql/memsql-loader/releases" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
============= | ||
MemSQL Loader | ||
============= | ||
|
||
MemSQL Loader is a tool that lets you load sets of files from Amazon S3, the Hadoop Distributed | ||
File System (HDFS), and the local filesystem into MemSQL (or MySQL) with just one command. You can | ||
specify all of the files you want to load with one command, and MemSQL Loader will take care of | ||
deduplicating files, parallelizing the workload, retrying files if they fail to load, and more. | ||
|
||
Background | ||
========== | ||
|
||
One of the most common tasks with any database is loading large amounts of data into | ||
it from an external data store. Both MemSQL and MySQL provide the LOAD DATA command | ||
for this task; this command is very powerful, but by itself, it has a number of restrictions: | ||
|
||
* It can only read from the local filesystem, so loading data from a remote store like | ||
Amazon S3 requires first downloading the files you need. | ||
* Since it can only read from a single file at a time, if you want to load from multiple | ||
files, you need to issue multiple LOAD DATA commands. If you want to perform this work | ||
in parallel, you have to write your own scripts. | ||
* If you are loading multiple files, it’s up to you to make sure that you’ve deduplicated | ||
the files and their contents. | ||
|
||
At MemSQL, we’ve acutely felt all of these limitations. That’s why we developed MemSQL Loader, | ||
which solves all of the above problems and more. | ||
|
||
Basic Usage | ||
=========== | ||
|
||
Downloading the Loader | ||
---------------------- | ||
|
||
The loader is a standalone binary that you can download and run directly. We keep the latest | ||
version hosted at https://github.com/memsql/memsql-loader/releases. The binary is produced by compiling | ||
this python project with PyInstaller. | ||
|
||
You can download this repo and run the loader directly. If you do so, you will need to | ||
install virtualenv, libcurl, and libncurses. Once you've downloaded the repo, | ||
`cd` into its directory and run | ||
|
||
$ source activate | ||
|
||
You should see the prefix `(venv)` in your shell. You can run the loader with | ||
|
||
(venv) $ ./bin/memsql-loader --help | ||
|
||
Running the Loader | ||
------------------ | ||
|
||
The primary interface to the loader is the `memsql-loader load` command. The command takes arguments | ||
that specify the source, parsing options, and destination server. For example, to load some files | ||
from S3, you can run | ||
|
||
$ ./memsql-loader load -h 127.0.0.1 -u root --database db --table t \ | ||
s3://memsql-loader-examples/sanity/* | ||
|
||
The loader automatically daemonizes and runs the load process in a background server. You can monitor its | ||
progress with | ||
|
||
$ ./memsql-loader ps --watch | ||
|
||
If you would like to run this example against MemSQL or MySQL, run | ||
|
||
memsql> CREATE DATABASE db; | ||
memsql> CREATE TABLE db.t (a int, b int, primary key (a)); | ||
|
||
File Pattern Syntax | ||
------------------- | ||
|
||
The loader supports loading files from Amazon S3, HDFS, and the local filesystem. The file's prefix | ||
determines the source. You can specify "s3://", "hdfs://", or "file://". If you omit the prefix, | ||
then the loader defaults to the local filesystem. | ||
|
||
The loader also supports glob syntax (with semantics similar to bash). A single `*` matches files | ||
in the current directory, and `**` matches files recursively. MemSQL Loader uses the glob2 library | ||
under the hood to facilitate this. | ||
|
||
File Parsing Options | ||
-------------------- | ||
|
||
MemSQL Loader's command line options mirror the LOAD DATA command's syntax. See the `load data options` | ||
section in `./memsql-loader load --help` for reference. | ||
|
||
Automatic Deduplication | ||
----------------------- | ||
|
||
MemSQL Loader is designed to support one-time loads as well as synchronizing behavior. You can use this | ||
functionality to effectively sync a table's data to the set of files matching a path. The loader will automatically | ||
deduplicate files that it knows it does not need to load (by using the MD5 feature on S3), and transactionally | ||
delete and reload data when the contents of a file have changed. | ||
|
||
NOTE: This reload behavior requires specifying a column to use as a `file_id`. | ||
|
||
Spec Files | ||
---------- | ||
|
||
In practice, we found it really convenient to be able to define a load job as a JSON file, instead of | ||
just command line options. MemSQL Loader lets you use "spec files" to accomplish this. To generate one, just | ||
append `--print-spec` to the `./memsql-loader load` command. It will generate a spec file that you can | ||
use with `--spec`. Any command line options that you provide along with `--spec` will override options | ||
in the spec file. | ||
|
||
TODO | ||
==== | ||
|
||
* We have a pretty big test suite for the loader, but it's tied closely to MemSQL's internal testing | ||
infrastructure. We're going to separate these tests out and add them to this repo. | ||
* Right now the loader supports MemSQL and MySQL (via the LOAD DATA command), but does not support | ||
other database systems. We would love for members of the community to add support for more systems. | ||
* Error reporting and job management are fairly undeveloped in the loader. We'll integrate these further | ||
into our MemSQL Ops platform over time, but it would be great to see some iteration on this here as well. | ||
|
||
Third-party code | ||
================ | ||
MemSQL Loader includes a fork of the python-glob2 project (https://github.com/miracle2k/python-glob2/). | ||
The code for this fork can be found in [memsql_loader/vendor/glob2](memsql_loader/vendor/glob2). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/sh | ||
make deps | ||
source venv/bin/activate | ||
echo "You have been activated!" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/env python | ||
|
||
import os, sys | ||
ROOT_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..') | ||
sys.path.append(ROOT_PATH) | ||
|
||
from memsql_loader.util import config | ||
from memsql_loader.db import connection_wrapper, pool | ||
|
||
if __name__ == "__main__": | ||
options = config.load_options() | ||
|
||
try: | ||
options.command(options) | ||
finally: | ||
pool.close_connections() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# Build image: CentOS 5.8 plus source-built ncurses, libyaml, Ruby (for fpm), | ||
# Python 2.7.8, PyInstaller, libcurl and git. | ||
FROM tianon/centos:5.8 | ||
# MAINTAINER is deprecated; the contact address is carried as a label instead. | ||
LABEL maintainer="[email protected]" | ||
|
||
||
# dependencies | ||
# "which" was listed twice; it now appears once. The yum cache is dropped in | ||
# the same layer so it does not persist in the image. | ||
# NOTE(review): CentOS normally ships the SQLite headers as "sqlite-devel"; | ||
# confirm that "sqlite3-devel" actually resolves to a package here. | ||
RUN yum install -y make wget gcc gcc-c++ bzip2 bzip2-devel zlib-devel openssl-devel mysql-devel which patch autoconf curl-devel expat-devel gettext-devel sqlite3-devel httpd-devel libffi rpm-build && \ | ||
    yum clean all | ||
|
||
||
# install a newer version of ncurses (shared libraries, installed under /usr/local) | ||
RUN wget -q --no-check-certificate -O /tmp/ncurses.tar.gz http://ftp.gnu.org/pub/gnu/ncurses/ncurses-5.9.tar.gz && \ | ||
    cd /tmp && tar -xzf /tmp/ncurses.tar.gz && \ | ||
    cd /tmp/ncurses-5.9/ && \ | ||
    ./configure --prefix=/usr/local --with-shared --without-debug --enable-pc-files --with-terminfo-dirs="/etc/terminfo:/lib/terminfo:/usr/share/terminfo" --with-default-terminfo-dir=/usr/share/terminfo && \ | ||
    make -j 12 && make install | ||
|
||
||
# install libyaml | ||
WORKDIR /usr/src | ||
RUN wget http://pyyaml.org/download/libyaml/yaml-0.1.4.tar.gz | ||
RUN tar zxf yaml-0.1.4.tar.gz | ||
# Absolute path: a relative WORKDIR silently depends on the previous WORKDIR. | ||
WORKDIR /usr/src/yaml-0.1.4 | ||
RUN ./configure --prefix=/usr/local | ||
RUN make -j 12 | ||
RUN make install | ||
|
||
||
# install ruby 1.9.3 | ||
WORKDIR /usr/src | ||
RUN wget -q --no-check-certificate http://ftp.ruby-lang.org/pub/ruby/1.9/ruby-1.9.3-p0.tar.gz | ||
RUN tar zxf ruby-1.9.3-p0.tar.gz | ||
WORKDIR /usr/src/ruby-1.9.3-p0 | ||
RUN ./configure --prefix=/usr/local --disable-install-doc | ||
RUN make -j 12 | ||
RUN make install | ||
WORKDIR /usr/src | ||
|
||
||
# install fpm | ||
# NOTE(review): unpinned, so the installed fpm version depends on build time. | ||
RUN gem install fpm | ||
|
||
||
# install python2.7.8 | ||
# Built against the ncurses headers installed above; "altinstall" provides the | ||
# versioned /usr/local/bin/python2.7 that the following steps call by path. | ||
RUN wget -q --no-check-certificate -O /tmp/python-2.7.8.tar.gz http://www.python.org/ftp/python/2.7.8/Python-2.7.8.tgz && \ | ||
    cd /tmp && tar -xzf /tmp/python-2.7.8.tar.gz && \ | ||
    cd /tmp/Python-2.7.8/ && \ | ||
    ./configure --enable-shared --prefix=/usr/local CPPFLAGS="-I/usr/local/include -I/usr/local/include/ncurses" LDFLAGS=-Wl,-rpath=/usr/local/lib && \ | ||
    make -j 12 && make altinstall | ||
|
||
||
# install latest setuptools | ||
RUN wget -q --no-check-certificate -O /tmp/ez_setup.py https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py && \ | ||
    cd /tmp && /usr/local/bin/python2.7 /tmp/ez_setup.py --insecure | ||
|
||
||
# install latest pip | ||
RUN wget -q --no-check-certificate -O /tmp/get-pip.py https://raw.github.com/pypa/pip/master/contrib/get-pip.py && \ | ||
    /usr/local/bin/python2.7 /tmp/get-pip.py | ||
|
||
||
# install pip 1.5.6 (replaces whatever version the bootstrap script installed) | ||
RUN pip install pip==1.5.6 | ||
|
||
||
# install pyinstaller | ||
RUN /usr/local/bin/pip2.7 install pyinstaller==2.1 | ||
|
||
||
# install libcurl 7.37.0 from source | ||
RUN cd /tmp && wget -q --no-check-certificate -O /tmp/curl.tar.gz http://curl.haxx.se/download/curl-7.37.0.tar.gz && \ | ||
    tar -xzf curl.tar.gz && \ | ||
    cd curl* && \ | ||
    ./configure && make -j 12 && make install | ||
|
||
||
# install readline (compiled against the ncurses headers under /usr/local) | ||
RUN CPPFLAGS="-I/usr/local/include -I/usr/local/include/ncurses" LDFLAGS="-Wl,-rpath=/usr/local/lib,-rpath=/usr/local/include/ncurses" /usr/local/bin/easy_install-2.7 readline | ||
|
||
||
# install git from source | ||
RUN wget -q --no-check-certificate -O /tmp/git-2.2.0.tar.gz https://www.kernel.org/pub/software/scm/git/git-2.2.0.tar.gz && \ | ||
    ls /tmp && cd /tmp && tar xzvf /tmp/git-2.2.0.tar.gz && \ | ||
    cd /tmp/git-2.2.0 && \ | ||
    ./configure && \ | ||
    make && \ | ||
    make install |
Oops, something went wrong.