Skip to content

Commit

Permalink
Initial open source commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Wayne Song committed Dec 30, 2014
0 parents commit 8218455
Show file tree
Hide file tree
Showing 88 changed files with 6,258 additions and 0 deletions.
Empty file added .empty
Empty file.
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
exclude: .git,__pycache__,*.pyc,venv,distribution,memsql_loader/vendor
ignore: E121,E128,E201,E202,E221,E222,E241,E302,E4,E5,W292
30 changes: 30 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Vim
*.swp

venv
memsql_loader.log

memsql_loader.db*
15 changes: 15 additions & 0 deletions .pre-commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Git pre-commit hook.
#   1. Runs `make flake8` and aborts the commit when it reports problems.
#   2. Rejects staged changes that introduce whitespace errors.

# Redirect output to stderr.
exec 1>&2

if ! make flake8; then
    echo ""
    echo "Flake8 detected syntax problems."
    echo "Commit aborted."
    exit 1
fi

# On the very first commit of a repository HEAD does not exist yet, and
# `git diff-index HEAD` would fail with "bad revision". Diff against the
# empty tree object in that case instead.
if git rev-parse --verify HEAD >/dev/null 2>&1; then
    against=HEAD
else
    against=$(git hash-object -t tree /dev/null)
fi

# If there are whitespace errors, print the offending file names and fail.
exec git diff-index --check --cached "$against" --
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
2014-12-05 Version 2.0.0

* Initial open source release
21 changes: 21 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2014 MemSQL (http://www.memsql.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
include *.txt
include *.md
90 changes: 90 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
##############################
# ENV
#
# Recipes in this file use the bash-only `source` builtin, so pin the shell.
SHELL := /bin/bash

# Verify that the system-level build dependencies are available. Each check
# prints the missing package to stderr and aborts with a non-zero status.
# NOTE(review): DEPS_TEXT is read from the shell environment and is not set
# anywhere in this Makefile; when unset the messages simply have no prefix.
.PHONY: pre-check
pre-check:
	@command -v virtualenv >/dev/null 2>&1 || { echo >&2 "$$DEPS_TEXT Missing package: virtualenv"; exit 1; }
	@# Only the command name is passed: `command -v` treats every extra word
	@# (previously a stray `--version`) as another command to look up.
	@command -v curl-config >/dev/null 2>&1 || { echo >&2 "$$DEPS_TEXT Missing package: libcurl"; exit 1; }
	@# Link an empty program against ncurses to prove the library is installed.
	@echo "int main(){}" | gcc -o /dev/null -x c - -lncurses 2>/dev/null || { echo >&2 "$$DEPS_TEXT Missing package: libncurses"; exit 1; }

# Developer bootstrap: run the system checks, build the virtualenv, install
# the git hook, then run scripts/apsw_install.sh inside the activated venv.
.PHONY: deps
deps: pre-check venv/bin/activate .git/hooks/pre-commit
	@. venv/bin/activate && ./scripts/apsw_install.sh

# `venv` is a convenience alias for the real file target below.
.PHONY: venv
venv: venv/bin/activate

# (Re)build the virtualenv whenever requirements.txt changes. The activate
# script doubles as the timestamp marker for "requirements are installed".
venv/bin/activate: requirements.txt
	test -d venv || virtualenv venv
	@# Chain with && so a failed activation can never fall through and
	@# install packages into the system Python instead of the virtualenv.
	. venv/bin/activate && easy_install readline
	. venv/bin/activate && pip install -r $<
	touch $@

# Install the tracked hook script into the local git directory and make it
# executable; re-run whenever .pre-commit changes.
.git/hooks/pre-commit: .pre-commit
	@cp $< $@
	@chmod +x $@

##############################
# HELPERS
#

# List every TODO marker in the tree (requires `ag`); the Makefile itself is
# excluded so this target does not match its own recipe.
.PHONY: todo
todo:
	@ag "TODO" --ignore Makefile

# Lint the project using the repository's flake8 configuration.
# `&&` (rather than `;`) ensures the tool only runs when the virtualenv was
# activated successfully, instead of silently using a system-wide install.
.PHONY: flake8
flake8:
	source venv/bin/activate && flake8 --config=.flake8 .

# Open an interactive IPython session preloaded by scripts/ipython.py.
.PHONY: console
console:
	source venv/bin/activate && ipython -i scripts/ipython.py

# Count lines of code per file and language (requires `cloc`).
.PHONY: cloc
cloc:
	cloc --by-file-by-lang bin memsql_loader setup.py

##############################
# BUILD
#
# Version string reported by the memsql_loader package. It is evaluated once
# when the Makefile is parsed and exported to the environment of every recipe.
export MEMSQL_LOADER_VERSION := $(shell python -c "import memsql_loader; print memsql_loader.__version__")

# Print the package version.
.PHONY: version
version:
	@echo $(MEMSQL_LOADER_VERSION)

# Remove build products, packaging leftovers and Python bytecode caches.
.PHONY: clean
clean:
	@# Best effort: the leading `-` keeps going if the distribution clean fails.
	@# $(MAKE) (not a literal `make`) propagates -n, -j and the jobserver.
	-$(MAKE) -C distribution clean
	rm -f logdict2.7.4.final*
	rm -rf *.egg memsql_loader.egg-info dist build
	python setup.py clean --all
	@# Let find hand the paths to rm directly so names containing whitespace
	@# are not split by the shell; -prune stops find from descending into a
	@# __pycache__ directory that is about to be removed.
	find . -type f -name "*.pyc" -exec rm -f {} +
	find . -name "__pycache__" -prune -exec rm -rf {} +

# Build the standalone binary by delegating to the distribution/ Makefile.
# $(MAKE) (not a literal `make`) propagates -n, -j and the jobserver.
distribution/dist/memsql-loader: distribution/memsql_loader.spec
	$(MAKE) -C distribution build

# Clean, then build the binary.
# NOTE(review): the clean-before-build order relies on prerequisites being
# processed left to right; nothing enforces it under `make -j` - confirm
# whether parallel builds need to be supported.
.PHONY: build
build: clean distribution/dist/memsql-loader

# Flags that identify the GitHub repository receiving the release.
release_repo_flags := -u memsql -r memsql-loader

# Tag the current version, push the tag, and publish the built binary as a
# draft GitHub release using the `github-release` CLI.
.PHONY: release
release: distribution/dist/memsql-loader
	git tag -f "$(MEMSQL_LOADER_VERSION)"
	git push --tags
	@# NOTE(review): presumably gives GitHub a moment to register the tag - confirm.
	@sleep 1
	@# The `-` prefix ignores failures of info/delete (presumably the case
	@# where no release exists for this tag yet - confirm).
	-github-release info $(release_repo_flags)
	-github-release delete $(release_repo_flags) \
		--tag "$(MEMSQL_LOADER_VERSION)"
	github-release release $(release_repo_flags) \
		--tag "$(MEMSQL_LOADER_VERSION)" \
		--name "MemSQL Loader $(MEMSQL_LOADER_VERSION)" \
		--description "$$(./scripts/latest_changes.py)" \
		--draft
	github-release upload $(release_repo_flags) \
		--tag "$(MEMSQL_LOADER_VERSION)" \
		--name "memsql-loader" \
		--file "$<"
	@echo "The release has been uploaded as a draft. View/Edit/Delete it here:"
	@echo "https://github.com/memsql/memsql-loader/releases"
117 changes: 117 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
MemSQL Loader
=============

MemSQL Loader is a tool that lets you load sets of files from Amazon S3, the Hadoop Distributed
File System (HDFS), and the local filesystem into MemSQL (or MySQL) with just one command. You can
specify all of the files you want to load with one command, and MemSQL Loader will take care of
deduplicating files, parallelizing the workload, retrying files if they fail to load, and more.

Background
==========

One of the most common tasks with any database is loading large amounts of data into
it from an external data store. Both MemSQL and MySQL provide the LOAD DATA command
for this task; this command is very powerful, but by itself, it has a number of restrictions:

* It can only read from the local filesystem, so loading data from a remote store like
Amazon S3 requires first downloading the files you need.
* Since it can only read from a single file at a time, if you want to load from multiple
files, you need to issue multiple LOAD DATA commands. If you want to perform this work
in parallel, you have to write your own scripts.
* If you are loading multiple files, it’s up to you to make sure that you’ve deduplicated
the files and their contents.

At MemSQL, we’ve acutely felt all of these limitations. That’s why we developed MemSQL Loader,
which solves all of the above problems and more.

Basic Usage
===========

Downloading the Loader
----------------------

The loader is a standalone binary that you can download and run directly. We keep the latest
version hosted at https://github.com/memsql/memsql-loader/releases. The binary is produced by compiling
this python project with PyInstaller.

You can download this repo and run the loader directly. If you do so, you will need to
install virtualenv, libcurl, and libncurses. Once you've downloaded the repo,
`cd` into its directory and run

$ source activate

You should see the prefix `(venv)` in your shell. You can run the loader with

(venv) $ ./bin/memsql-loader --help

Running the Loader
------------------

The primary interface to the loader is the `memsql-loader load` command. The command takes arguments
that specify the source, parsing options, and destination server. For example, to load some files
from S3, you can run

$ ./memsql-loader load -h 127.0.0.1 -u root --database db --table t \
s3://memsql-loader-examples/sanity/*

The loader automatically daemonizes and runs the load process in a background server. You can monitor its
progress with

$ ./memsql-loader ps --watch

Before running this example against MemSQL or MySQL, create the target database and table:

memsql> CREATE DATABASE db;
memsql> CREATE TABLE db.t (a int, b int, primary key (a));

File Pattern Syntax
-------------------

The loader supports loading files from Amazon S3, HDFS, and the local filesystem. The file's prefix
determines the source. You can specify "s3://", "hdfs://", or "file://". If you omit the prefix,
then the loader defaults to the local filesystem.

The loader also supports glob syntax (with semantics similar to bash). A single `*` matches files
in the current directory, and `**` matches files recursively. MemSQL Loader uses the glob2 library
under the hood to facilitate this.

File Parsing Options
--------------------

MemSQL Loader's command line options mirror the LOAD DATA command's syntax. See the `load data options`
section in `./memsql-loader load --help` for reference.

Automatic Deduplication
-----------------------

MemSQL Loader is designed to support both one-time loads and ongoing synchronization. You can use this
functionality to keep a table's data in sync with the set of files matching a path. The loader will automatically
skip files that it knows it does not need to load (by using the MD5 feature on S3), and will transactionally
delete and reload data when the contents of a file have changed.

NOTE: This reload behavior requires specifying a column to use as a `file_id`.

Spec Files
----------

In practice, we found it very convenient to define a load job as a JSON file instead of passing
everything as command line options. MemSQL Loader lets you use "spec files" to accomplish this. To generate one,
append `--print-spec` to the `./memsql-loader load` command. It will generate a spec file that you can
use with `--spec`. Any command line options that you provide along with `--spec` will override the
corresponding options in the spec file.

TODO
====

* We have a pretty big test suite for the loader, but it's tied closely to MemSQL's internal testing
infrastructure. We're going to separate these tests out and add them to this repo.
* Right now the loader supports MemSQL and MySQL (via the LOAD DATA command), but does not support
other database systems. We would love for members of the community to add support for more systems.
* Error reporting and job management are fairly undeveloped in the loader. We'll integrate these further
into our MemSQL Ops platform over time, but it would be great to see some iteration on them here as well.

Third-party code
================
MemSQL Loader includes a fork of the python-glob2 project (https://github.com/miracle2k/python-glob2/).
The code for this fork can be found in [memsql_loader/vendor/glob2](memsql_loader/vendor/glob2).
4 changes: 4 additions & 0 deletions activate
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Meant to be sourced from the repository root (`source activate`) so the
# virtualenv stays active in the calling shell. Because it is sourced, it must
# never call `exit`, which would close the user's shell.
#
# Each step only runs if the previous one succeeded, so the success message is
# printed only when the environment really is active. `.` is the portable
# spelling of `source` and matches the sh shebang.
if make deps && . venv/bin/activate; then
    echo "You have been activated!"
else
    echo "Activation failed; see the errors above." >&2
fi
16 changes: 16 additions & 0 deletions bin/memsql-loader
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python

import os
import sys

# Make the repository root importable so the script can be run straight from
# a checkout.
ROOT_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
sys.path.append(ROOT_PATH)

from memsql_loader.util import config
from memsql_loader.db import connection_wrapper, pool


def main():
    """Load the options, run the selected command, then close pooled connections.

    The connection pool is closed even when the command raises.
    """
    options = config.load_options()
    try:
        options.command(options)
    finally:
        pool.close_connections()


if __name__ == "__main__":
    main()
72 changes: 72 additions & 0 deletions distribution/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Build image for memsql-loader, based on CentOS 5.8. Most of the toolchain
# (ncurses, libyaml, ruby, python, curl, git) is compiled from source below.
FROM tianon/centos:5.8
MAINTAINER [email protected]

# dependencies
# NOTE(review): CentOS normally ships the SQLite headers as `sqlite-devel`;
# confirm that `sqlite3-devel` resolves to a package on this base image.
RUN yum install -y make wget gcc gcc-c++ bzip2 bzip2-devel zlib-devel openssl-devel mysql-devel which patch autoconf curl-devel expat-devel gettext-devel sqlite3-devel httpd-devel libffi rpm-build

# install a newer version of ncurses
RUN wget -q --no-check-certificate -O /tmp/ncurses.tar.gz http://ftp.gnu.org/pub/gnu/ncurses/ncurses-5.9.tar.gz && \
    cd /tmp && tar -xzf /tmp/ncurses.tar.gz && \
    cd /tmp/ncurses-5.9/ && \
    ./configure --prefix=/usr/local --with-shared --without-debug --enable-pc-files --with-terminfo-dirs="/etc/terminfo:/lib/terminfo:/usr/share/terminfo" --with-default-terminfo-dir=/usr/share/terminfo && \
    make -j 12 && make install

# install libyaml
WORKDIR /usr/src
RUN wget http://pyyaml.org/download/libyaml/yaml-0.1.4.tar.gz
RUN tar zxf yaml-0.1.4.tar.gz
WORKDIR yaml-0.1.4
RUN ./configure --prefix=/usr/local
RUN make -j 12
RUN make install

# install ruby 1.9.3
WORKDIR /usr/src
RUN wget -q --no-check-certificate http://ftp.ruby-lang.org/pub/ruby/1.9/ruby-1.9.3-p0.tar.gz
RUN tar zxf ruby-1.9.3-p0.tar.gz
WORKDIR /usr/src/ruby-1.9.3-p0
RUN ./configure --prefix=/usr/local --disable-install-doc
RUN make -j 12
RUN make install
WORKDIR /usr/src

# install fpm
RUN gem install fpm

# install python2.7.8 (altinstall: adds python2.7 without replacing `python`)
RUN wget -q --no-check-certificate -O /tmp/python-2.7.8.tar.gz http://www.python.org/ftp/python/2.7.8/Python-2.7.8.tgz && \
    cd /tmp && tar -xzf /tmp/python-2.7.8.tar.gz && \
    cd /tmp/Python-2.7.8/ && \
    ./configure --enable-shared --prefix=/usr/local CPPFLAGS="-I/usr/local/include -I/usr/local/include/ncurses" LDFLAGS=-Wl,-rpath=/usr/local/lib && \
    make -j 12 && make altinstall

# install latest setuptools
RUN wget -q --no-check-certificate -O /tmp/ez_setup.py https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py && \
    cd /tmp && /usr/local/bin/python2.7 /tmp/ez_setup.py --insecure

# bootstrap pip with get-pip.py
RUN wget -q --no-check-certificate -O /tmp/get-pip.py https://raw.github.com/pypa/pip/master/contrib/get-pip.py && \
    /usr/local/bin/python2.7 /tmp/get-pip.py

# pin pip to 1.5.6 (explicit interpreter-specific path, like the steps below)
RUN /usr/local/bin/pip2.7 install pip==1.5.6

# install pyinstaller
RUN /usr/local/bin/pip2.7 install pyinstaller==2.1

# install libcurl 7.37.0 from source
RUN cd /tmp && wget -q --no-check-certificate -O /tmp/curl.tar.gz http://curl.haxx.se/download/curl-7.37.0.tar.gz && \
    tar -xzf curl.tar.gz && \
    cd curl* && \
    ./configure && make -j 12 && make install

# install readline
RUN CPPFLAGS="-I/usr/local/include -I/usr/local/include/ncurses" LDFLAGS="-Wl,-rpath=/usr/local/lib,-rpath=/usr/local/include/ncurses" /usr/local/bin/easy_install-2.7 readline

# install git from source
RUN wget -q --no-check-certificate -O /tmp/git-2.2.0.tar.gz https://www.kernel.org/pub/software/scm/git/git-2.2.0.tar.gz && \
    cd /tmp && tar xzvf /tmp/git-2.2.0.tar.gz && \
    cd /tmp/git-2.2.0 && \
    ./configure && \
    make && \
    make install
Loading

0 comments on commit 8218455

Please sign in to comment.