Important and useful Hive commands
--starting the hive CLI shell
hive
--listing databases
show databases;
--Running a hive script from a file
hive -f /home/username/jars/hive-script.sql
--[No matter which method is used, the Hive metastore is consulted for the metadata needed to connect to Hive tables]
--to run any shell command from hive, prefix it with !
!clear
--Running hdfs commands from the hive shell
dfs -ls /
--quitting the hive shell
quit;
==========================================================================
--Hive Databases
--when a database is created, a directory named <db_name>.db is created under /user/hive/warehouse/
create database ecom_db;
--to create a database only if it does not already exist
create database if not exists ecom_db;
--to show databases
show databases;
--set the current database
use ecom_db;
--to drop an empty database
drop database rishi_db;
--to drop a database together with the tables in it
drop database rishi_db cascade;
--Checking the hive configuration parameters
--From cloudera manager
--From backend at location /etc/hive/conf -- check hive.metastore.location parameter
--Location of this database in hadoop:
--  /user/hive/warehouse/ecom_db.db
==========================================================
--HIVE tables
--1. managed (internal) tables [the default table type]
--creating a managed table without a location clause
create table orders (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile;
create table order_lines(
ord_item_id int,
ord_item_ord_id int,
ord_item_prd_id int,
ord_item_qty TINYINT,
ord_item_subttl FLOAT,
ord_item_prd_prc FLOAT
)
row format delimited fields terminated by ','
stored as textfile;
--describe table (each form shows increasing detail)
describe orders;
describe extended orders;
describe formatted orders;
=========================================================
--[redacted during review: a stray credential-like string was committed here and has been removed]
--loading data into a hive table
--from a local (linux) path: the file is COPIED into the table's warehouse directory
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders;
LOAD DATA local INPATH '/home/username/input/order_lines.csv' into table order_lines;
--from an hdfs path: the file is MOVED (cut-paste) into the table's warehouse directory
LOAD DATA INPATH '/user/username/input/orders.csv' into table orders;
===========================================================
--creating a managed table with a location clause
--the table's data lives at the given HDFS path instead of under /user/hive/warehouse
create table orders_loc (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile
location '/user/username/input/orders_loc';
--the loaded file lands in the custom location above
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_loc;
============================================================
--2. external tables (Hive manages only the metadata, not the data files)
--creating an external table without a location clause
create external table orders_ext (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile;
--describe table (fixed typo: "descrive" -> "describe")
describe formatted orders_ext;
=====================================================
--loading data into the external table
--from local location: data gets copy pasted
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_ext;
--drop table - for an EXTERNAL table the underlying data doesn't get deleted (only the metadata is removed)
drop table orders_ext;
===================================================
--creating an external table with a location clause
--Hive reads/writes data at the given HDFS path; dropping the table leaves the files in place
create external table orders_ext_loc (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile
location '/user/username/input/orders_ext_loc';
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_ext_loc;
====================================================
--altering tables
ALTER TABLE orders_loc RENAME TO orders_loc_new;
--point the table at a new HDFS directory (added the missing statement terminator)
ALTER TABLE orders_loc_new set location '/user/username/input/orders_loc_new';
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_loc_new;
ALTER TABLE orders_loc_new ADD COLUMNS (price int);
--CHANGE old_name new_name new_type: here only the type changes
ALTER TABLE orders_loc_new CHANGE price price float;
--convert the managed table into an external one
alter table orders_loc_new SET TBLPROPERTIES('EXTERNAL'='TRUE');
=====================================================
--replacing (overwriting) the existing data in the hive table instead of appending
LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' OVERWRITE INTO TABLE orders_loc_new;
======================================================
--partitioned tables
--default mode is 'strict' (requires at least one static partition); 'nonstrict' allows fully dynamic partitioning
set hive.exec.dynamic.partition.mode=nonstrict;
create table part_orders(ord_id int, ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string);
--the partition column must come LAST in the select list
INSERT OVERWRITE TABLE part_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;
select * from orders where ord_status='COMPLETE';
select * from part_orders where ord_status='COMPLETE';
--dynamically creating partitions in the table (added the missing statement terminator)
insert into orders values (00034, "2014-07-23 00:00:00.0", 5333, "NPA");
INSERT OVERWRITE TABLE part_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;
--multilevel partitioned tables
set hive.exec.dynamic.partition.mode=nonstrict;
create table mult_part_orders(ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string, ord_id int);
INSERT OVERWRITE TABLE mult_part_orders PARTITION(ord_status, ord_id)
SELECT ord_date, ord_cust_id, ord_status,ord_id from orders
where ord_id<=10;
=======================================================
--bucketing
--NOTE(review): hive.enforce.bucketing is always-on in Hive 2.x+ — confirm it is needed for the target version
set hive.enforce.bucketing = true;
create table bucket_orders (ord_id int, ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string)
clustered by (ord_id) into 3 buckets row format delimited fields terminated by ',';
--rows are hashed on ord_id into 3 bucket files per partition
INSERT OVERWRITE TABLE bucket_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;
--filtering on the clustering column can limit which bucket files are scanned
select * from bucket_orders where ord_status='COMPLETE'
and ord_id=1;
=====================================================
--managing output
--writing query results to a local directory (added the missing statement terminators)
insert overwrite local directory '/home/username/output' Row format delimited
fields terminated by '|' select * from orders;
--writing query results to a directory in HDFS
insert overwrite directory '/user/username/output/h_out' Row format delimited
fields terminated by '#' select * from orders;
============================
--to check the hive query plan, also known as the abstract syntax tree / execution plan
explain select * from orders;
===============================
--joins in hive
--inner join:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o join order_lines ol
on o.ord_id = ol.ord_item_ord_id limit 10;
--left outer join:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o left outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id limit 10;
--anti-join pattern: orders with NO matching order lines (right side is NULL)
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o left outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id
where ol.ord_item_ord_id is NULL
limit 10;
--right outer join:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o right outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id limit 10;
--full outer join, keeping only rows unmatched on the order_lines side:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o full outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id
where ol.ord_item_ord_id is NULL
limit 10;
===============================
--views in Hive
--a view is a stored query; no data is materialized
CREATE VIEW completed_orders AS
SELECT * FROM orders
Where ord_status='COMPLETE';
=======================================
--Hive Beeline (JDBC client for HiveServer2)
beeline
--connect to HiveServer2 over JDBC (default port 10000)
!connect jdbc:hive2://hive.edu.cloudlab.com:10000
=======================================
--Hive UDFs (user-defined functions packaged in a jar)
ADD jar /home/username/jars/HiveUdfs.jar;
--register the UDF class under a temporary (session-scoped) function name
create temporary function r_lower as 'myudfs.lcase';
--to see the various functions available
show functions;
======================================================
--indexes in Hive
--NOTE(review): Hive indexes were removed in Hive 3.0 — confirm the target Hive version still supports them
--baseline query (before building any index)
select ord_item_ord_id, sum(ord_item_subttl)
from order_lines
group by ord_item_ord_id;
--compact index; WITH DEFERRED REBUILD means it is empty until the REBUILD below
CREATE INDEX order_lines_index
ON TABLE order_lines (ord_item_ord_id)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
Alter index order_lines_index on order_lines REBUILD;
--same query again, now with the index available
select ord_item_ord_id, sum(ord_item_subttl)
from order_lines
group by ord_item_ord_id;
--bitmap index variant on the same column
CREATE INDEX order_lines_index_bitmap
ON TABLE order_lines (ord_item_ord_id)
AS 'BITMAP'
WITH DEFERRED REBUILD;
ALTER INDEX order_lines_index_bitmap on order_lines REBUILD;
show index on order_lines;
DROP INDEX IF EXISTS order_lines_index_bitmap ON order_lines;
DROP INDEX IF EXISTS order_lines_index ON order_lines;
====================================================
--compression techniques in Hive
set hive.exec.compress.output = true;
set mapreduce.output.fileoutputformat.compress=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec;
--available codec classes:
--  4mc     com.hadoop.compression.fourmc.FourMcCodec
--  gzip    org.apache.hadoop.io.compress.GzipCodec
--  lzo     com.hadoop.compression.lzo.LzopCodec
--  snappy  org.apache.hadoop.io.compress.SnappyCodec
--  bzip2   org.apache.hadoop.io.compress.BZip2Codec
--  lz4     org.apache.hadoop.io.compress.Lz4Codec
--single-column table; fixed the field terminator typo '/n' -> '\n' so each whole line is one field
create table orders_compress ( line String)
row format delimited
fields terminated by '\n';
LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' OVERWRITE INTO TABLE orders_compress;
--output files are written with the configured compression codec
INSERT OVERWRITE DIRECTORY '/user/username/output/compressed_orders_data/' SELECT * FROM orders_compress;
dfs -ls /user/hive/warehouse/ecom_db.db/orders_compress
dfs -ls /user/username/output/compressed_orders_data/
--added the missing statement terminator after STORED AS TEXTFILE
CREATE TABLE compress_orders_demo_load( line String)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\n' STORED AS TEXTFILE;
--Hive decompresses the codec-compressed files transparently on read
LOAD DATA INPATH '/user/username/output/compressed_orders_data/*' INTO TABLE compress_orders_demo_load;
select * from compress_orders_demo_load;
==========================================================
--file formats in Hive
--1. RC file:
create table orders_rc(
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as RCFILE ;
--insert-select rewrites the rows into RC format
insert into orders_rc
select * from orders;
--points at orders_rc's data directory, so it reads the same RC data files
CREATE TABLE orders_rc2 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as RCFILE
location '/user/hive/warehouse/ecom_db.db/orders_rc/';
CREATE TABLE orders_rc3 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as RCFILE;
--LOAD DATA moves files as-is and does NOT convert formats; loading an RC file works
LOAD DATA local INPATH '/home/username/input/orders.rc' overwrite into table orders_rc3;
--WARNING: loading a plain csv into an RCFILE table does not convert it; subsequent reads will fail
LOAD DATA local INPATH '/home/username/input/orders.csv' overwrite into table orders_rc2;
--2. Avro file format:
create table orders_avro (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as AVRO ;
insert into orders_avro
select * from orders;
CREATE TABLE orders_avro2 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as AVRO;
LOAD DATA local INPATH '/home/username/input/orders_avro_data.avro' into table orders_avro2;
--3. ORC file format:
create table orders_orc (
Order_id int,
Order_date string,
Order_cust_id int,
Order_status string
)
Stored as ORC;
--insert-select maps columns by position, so the different column names here are fine
insert into orders_orc
select * from orders;
============================================================================
--Hive functions
--substr(str, start [, len]): 1-based start; a negative start counts back from the end
select substr("hi there how are you", 2,5);
select substr("hi there how are you", 2);
select substr("hi there how are you", -7,5);
--extract characters 6-7 of the date string and cast to int
select cast(substr(ord_date,6,2) as int) from orders limit 10;
--instr: 1-based position of the substring (0 when absent)
select instr("hi there how are you", "there");
--split: returns an array of the pieces around the delimiter
select split("hi there how are you" ,'how');
--like matches the WHOLE string, so without wildcards this returns false
select "hi how are you" like 'are';
select "hi how are you" like '%are%';
--lcase or lower
select lcase("HELLO THERE");
--ucase or upper
select upper("hi there how are you");
--trim: strips leading and trailing spaces
select trim(' Hi there ');
--lpad and rpad: pad the value to the given total length with the fill string
select lpad(12,4,'0');
===========================================================
--Date functions
--current_date
select current_date;
--Current_timestamp
select current_timestamp;
--Date_add: add N days to a date
select date_add(current_date,3);
--Date_diff: number of days between the two dates (negative here)
select datediff(current_date,date_add(current_date,4));
--Date_format (SimpleDateFormat patterns)
select date_format(current_date,'y');
--NOTE(review): 'YYYY' is the week-based year in SimpleDateFormat; 'yyyy' is probably intended — confirm
select date_format(ord_date,'YYYYMM') from orders limit 10;
--Day
select day(current_date);
--Todate: truncate a timestamp to its date part
select current_timestamp,to_date(current_timestamp);
--To_unix_timestamp
select to_unix_timestamp(current_date);
--From_unixtime [ performance is better if the date is converted into a number using this function]
select from_unixtime(1599177634);
--Minute
select minute(current_timestamp);
--Month
select month(current_date);
--Months_between
select months_between(current_date,to_date('2021-09-21'));
--Next_day: date of the next Monday
select next_day(current_date,'MO');
===========================================================
--aggregation functions
--count of orders per status
select ord_status, count(*) from orders
group by ord_status;
--sum: revenue per order; HAVING filters the aggregated groups
select o.ord_id,sum(ol.ord_item_subttl) ord_revenue
from orders o join order_lines ol
on o.ord_id= ol.ord_item_ord_id
group by o.ord_id
having ord_revenue>1000;
--min (fixed table name: order_lines, not orders_line)
select min(ord_item_subttl) from order_lines;
--sorting data: daily revenue for closed/complete orders
select o.ord_date, sum(ol.ord_item_subttl) daily_ord_revenue
from orders o join order_lines ol
on o.ord_id= ol.ord_item_ord_id
where o.ord_status in ('CLOSED','COMPLETE')
group by o.ord_date
order by o.ord_date;
=======================================================
--special functions
--case, simple form: compares ord_status against each WHEN value
select ord_id, ord_status, case ord_status
when 'CLOSED' then 'OK'
when 'COMPLETE' then 'OK'
else 'NOT OK'
end from orders;
--case, searched form (fixed typo: 'COMPLTE' -> 'COMPLETE')
select case
when ord_status in ('CLOSED','COMPLETE') then 'OK'
else 'NOT OK'
end from orders limit 10;
--nvl and subquery
--NOTE(review): nvl mixes an int column with a string default — confirm Hive coerces as intended
select * from (
select nvl(ord_cust_id,'Missing status') as status from orders) as missing
where missing.status='Missing status';
--end of notes