-
Notifications
You must be signed in to change notification settings - Fork 14.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #418 from griffinqiu/vertica_hook
Add Vertica Database support for Airflow
- Loading branch information
Showing
10 changed files
with
198 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from vertica_python import connect | ||
|
||
from airflow.hooks.dbapi_hook import DbApiHook | ||
|
||
class VerticaHook(DbApiHook):
    """
    Interact with Vertica.

    Connection settings (host, port, login, password, schema) are read from
    the Airflow connection identified by ``vertica_conn_id``.
    """

    conn_name_attr = 'vertica_conn_id'
    default_conn_name = 'vertica_default'
    supports_autocommit = True

    def get_conn(self):
        """
        Return a Vertica connection object for the configured connection.
        """
        db = self.get_connection(self.vertica_conn_id)
        conn_config = {
            "user": db.login,
            "password": db.password,
            "database": db.schema,
            # Fall back to localhost and Vertica's default port (5433)
            # when the Airflow connection leaves them unset.
            "host": db.host or 'localhost',
            "port": int(db.port) if db.port else 5433,
        }
        return connect(**conn_config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import logging | ||
|
||
from airflow.hooks import VerticaHook | ||
from airflow.models import BaseOperator | ||
from airflow.utils import apply_defaults | ||
|
||
|
||
class VerticaOperator(BaseOperator):
    """
    Executes sql code in a specific Vertica database

    :param vertica_conn_id: reference to a specific Vertica database
    :type vertica_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template reference are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#b4e0ff'

    @apply_defaults
    def __init__(self, sql, vertica_conn_id='vertica_default', *args, **kwargs):
        super(VerticaOperator, self).__init__(*args, **kwargs)
        self.vertica_conn_id = vertica_conn_id
        self.sql = sql

    def execute(self, context):
        # Lazy %-formatting: the message is only built if the record is emitted.
        logging.info('Executing: %s', self.sql)
        hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
        hook.run(self.sql)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from builtins import chr | ||
from collections import OrderedDict | ||
import unicodecsv as csv | ||
import logging | ||
from tempfile import NamedTemporaryFile | ||
|
||
from airflow.hooks import HiveCliHook, VerticaHook | ||
from airflow.models import BaseOperator | ||
from airflow.utils import apply_defaults | ||
|
||
class VerticaToHiveTransfer(BaseOperator):
    """
    Moves data from Vertica to Hive. The operator runs
    your query against Vertica, stores the file locally
    before loading it into a Hive table. If the ``create`` or
    ``recreate`` arguments are set to ``True``,
    a ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
    Hive data types are inferred from the cursor's metadata.
    Note that the table generated in Hive uses ``STORED AS textfile``
    which isn't the most efficient serialization format. If a
    large amount of data is loaded and/or if the table gets
    queried considerably, you may want to use this operator only to
    stage the data into a temporary table before loading it into its
    final destination using a ``HiveOperator``.

    :param sql: SQL query to execute against the Vertica database
    :type sql: str
    :param hive_table: target Hive table, use dot notation to target a
        specific database
    :type hive_table: str
    :param create: whether to create the table if it doesn't exist
    :type create: bool
    :param recreate: whether to drop and recreate the table at every execution
    :type recreate: bool
    :param partition: target partition as a dict of partition columns and values
    :type partition: dict
    :param delimiter: field delimiter in the file
    :type delimiter: str
    :param vertica_conn_id: source Vertica connection
    :type vertica_conn_id: str
    :param hive_cli_conn_id: destination hive connection
    :type hive_cli_conn_id: str
    """

    template_fields = ('sql', 'partition', 'hive_table')
    template_ext = ('.sql',)
    ui_color = '#b4e0ff'

    @apply_defaults
    def __init__(
            self,
            sql,
            hive_table,
            create=True,
            recreate=False,
            partition=None,
            delimiter=chr(1),
            vertica_conn_id='vertica_default',
            hive_cli_conn_id='hive_cli_default',
            *args, **kwargs):
        super(VerticaToHiveTransfer, self).__init__(*args, **kwargs)
        self.sql = sql
        self.hive_table = hive_table
        self.create = create
        self.recreate = recreate
        self.delimiter = str(delimiter)
        self.vertica_conn_id = vertica_conn_id
        self.hive_cli_conn_id = hive_cli_conn_id
        # Normalize None to an empty dict once (the original assigned
        # self.partition twice; the first assignment was dead code).
        self.partition = partition or {}

    @classmethod
    def type_map(cls, vertica_type):
        """Map a vertica-python type code to a Hive column type.

        Unknown codes fall back to STRING.
        """
        # vertica-python's datatype module does not provide full type-mapping
        # access, so this table is maintained by hand. Reference:
        # https://github.com/uber/vertica-python/blob/master/vertica_python/vertica/column.py
        d = {
            5: 'BOOLEAN',
            6: 'INT',
            7: 'FLOAT',
            8: 'STRING',
            9: 'STRING',
            16: 'FLOAT',
        }
        return d.get(vertica_type, 'STRING')

    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)

        logging.info("Dumping Vertica query results to local file")
        conn = vertica.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f, delimiter=self.delimiter,
                                    encoding='utf-8')
            # Build the Hive schema from the cursor metadata.
            field_dict = OrderedDict()
            for position, field in enumerate(cursor.description, start=1):
                # Unnamed result columns get a positional placeholder name.
                col_name = field[0] or "Column{position}".format(
                    position=position)
                field_dict[col_name] = self.type_map(field[1])
            csv_writer.writerows(cursor.iterate())
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(
                f.name,
                self.hive_table,
                field_dict=field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,8 +27,9 @@ | |
# Optional dependency groups for `pip install airflow[<extra>]`.
slack = ['slackclient>=0.15']
crypto = ['cryptography>=0.9.3']
oracle = ['cx_Oracle>=5.1.2']
vertica = ['vertica-python>=0.5.1']

# vertica joins the aggregate database extra (the superseded pre-Vertica
# assignment of all_dbs has been removed).
all_dbs = postgres + mysql + hive + mssql + hdfs + vertica
devel = all_dbs + doc + samba + s3 + ['nose'] + slack + crypto + oracle
|
||
setup( | ||
|
@@ -80,6 +81,7 @@ | |
'slack': slack, | ||
'crypto': crypto, | ||
'oracle': oracle, | ||
'vertica': vertica, | ||
}, | ||
author='Maxime Beauchemin', | ||
author_email='[email protected]', | ||
|