Monday, 11 April 2016

Sqoop Import commands

sqoop commands:

sqoop list database  command
==================


sqoop list-databases \
  --connect "jdbc:mysql://quickstart.cloudera:3306" \
  --username retail_dba \
  --password cloudera

sqoop list tables command
=========================

sqoop list-tables \
 --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
 --username retail_dba \
 --password cloudera

sqoop eval command
=========================

sqoop eval \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username retail_dba \
  --password cloudera \
  --query "select count(1) from order_items"


sqoop import all tables commands
================================

sqoop import-all-tables \
   -m 12 \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --warehouse-dir=/user/cloudera/sqoop_import_all

sqoop import individual tables commands
=======================================
--Default
sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --table departments \
  --as-textfile \
  --target-dir=/user/cloudera/departments

note : use below options for various file formats

  --as-avrodatafile \
  --as-sequencefile \
  --as-textfile \
  --as-parquetfile \

sqoop import delimitors
=======================
sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --table departments \
  --as-textfile \
  --target-dir=/user/cloudera/departments \
  --fields-terminated-by '|' \
  --lines-terminated-by '\n' \


sqoop boundary Query and columns
================================

sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --table departments \
  --target-dir /user/cloudera/departments \
  --boundary-query "select 2, 8 from departments" \
  --columns department_id,department_name

sqoop where clause
==================

sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --table departments \
  --target-dir /user/cloudera/departments \
  --where  "department_id > 4" \
  --columns department_id,department_name



sqoop join query and split by
=============================


sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --query="select * from orders join order_items on orders.order_id = order_items.order_item_order_id where \$CONDITIONS" \
  --target-dir /user/cloudera/order_join \
  --split-by order_id


note : sqoop look for the primary key to determine no of mappers. Hence primary key or split by key must be specified.


sqoop incremental load
======================

sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --table departments \
  --target-dir /user/cloudera/departments \
  --append \
  --fields-terminated-by '|' \
  --lines-terminated-by '\n' \
  --check-column "department_id" \
  --incremental append \
  --last-value 7 \


note : 2 modes are present for incremental load
1. append
2. last modified

HIVE Related sqoop commands
===========================

Loading avro formatted dataset to hive.

1. sqoop command to generate avro dataset from mysql

sqoop import \
  --connect "jdbc:mysql://quickstart.cloudera:3306/retail_db" \
  --username=retail_dba \
  --password=cloudera \
  --table departments \
  --as-avrodatafile \
  --target-dir=/user/cloudera/departments

note : an Avro schema file will be generated in the path where sqoop import runs.

2. copy the schema generated to hadoop

fs -put departments.avsc /user/cloudera

3. Create Hive table specifying data set location and avro serde

CREATE EXTERNAL TABLE departments
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs://quickstart.cloudera/user/cloudera/departments'
TBLPROPERTIES ('avro.schema.url'='hdfs://quickstart.cloudera/user/cloudera/departments.avsc');


No comments:

Post a Comment