Important and useful Hive commands


-- Start the interactive Hive CLI shell.

hive


-- List all databases visible to the current user.

show databases;


-- Run a Hive script file non-interactively (batch mode).

hive -f /home/username/jars/hive-script.sql


[No matter which method is used, the Hive metastore is consulted for the metadata needed to connect to Hive tables.]


-- Run any shell command from inside the hive shell by prefixing it with '!'.

!clear 


-- Run HDFS commands from the hive shell with the 'dfs' keyword
-- (faster than a separate 'hadoop fs' call: no new JVM is started).

dfs -ls /


-- Quit the hive shell.


quit;


==========================================================================

-- Hive Databases

-- When a database is created, a <name>.db directory is created under
-- /user/hive/warehouse/ (the default warehouse location).

create database ecom_db;

-- Create a database only if it does not already exist (no error on rerun).
create database if not exists ecom_db;

-- List databases.
show databases;

-- Set the current database for subsequent statements.
use ecom_db;

-- Drop an empty database (fails if it still contains tables).
drop database rishi_db;

-- Drop a database together with all the tables in it.
drop database rishi_db cascade;

-- Checking the hive configuration parameters:
--   * from Cloudera Manager, or
--   * on the backend at /etc/hive/conf -- check the
--     hive.metastore.warehouse.dir parameter (warehouse location).

-- Location of this database in HDFS:
-- /user/hive/warehouse/ecom_db.db


==========================================================


-- HIVE tables

-- 1. Managed tables [the default]: Hive owns both metadata and data;
--    dropping the table also deletes its data under the warehouse directory.

-- Create tables without a LOCATION clause (data lives under the warehouse dir).

create table orders (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile;

-- NOTE(review): decimal(p,s) would suit the monetary columns better than
-- float; kept as float to match the rest of these notes.
create table order_lines (
ord_item_id int,
ord_item_ord_id int,
ord_item_prd_id int,
ord_item_qty tinyint,
ord_item_subttl float,
ord_item_prd_prc float
)
row format delimited fields terminated by ','
stored as textfile;

-- Describe a table: columns only / plus raw storage info / nicely formatted.

describe orders;

describe extended orders;

describe formatted orders;


=========================================================

-- (removed a stray credential-like string that did not belong in these notes)

-- Loading data into a hive table.

-- From a local path: the file is COPIED into the table directory.

LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders;


LOAD DATA local INPATH '/home/username/input/order_lines.csv' into table order_lines;


-- From an HDFS path: the file is MOVED (cut-paste) into the table directory.

LOAD DATA  INPATH '/user/username/input/orders.csv' into table orders;


===========================================================


-- Creating a managed table with an explicit LOCATION clause.

CREATE TABLE orders_loc (
    ord_id      INT,
    ord_date    STRING,
    ord_cust_id INT,
    ord_status  STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/username/input/orders_loc';

-- Even with a custom location, LOAD DATA works the same way.
LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' INTO TABLE orders_loc;


============================================================


-- 2. External tables: Hive owns only the metadata;
--    dropping the table leaves the underlying data files in place.

-- Creating an external table without a LOCATION clause.

create external table orders_ext (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile;

-- Describe the table (fixed the "descrive" typo).
describe formatted orders_ext;

-- =====================================================

-- Loading data into the external table.

-- From a local path: the data gets copied in.
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_ext;

-- Drop the table -- the data doesn't get deleted (external table).
drop table orders_ext;

===================================================


-- Creating an external table with an explicit LOCATION clause.

CREATE EXTERNAL TABLE orders_ext_loc (
    ord_id      INT,
    ord_date    STRING,
    ord_cust_id INT,
    ord_status  STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/username/input/orders_ext_loc';

-- Load a local file into the external table's directory.
LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' INTO TABLE orders_ext_loc;

====================================================


-- Altering tables.

ALTER TABLE orders_loc RENAME TO orders_loc_new;

-- Point the table at a new HDFS directory (missing semicolon fixed).
ALTER TABLE orders_loc_new SET LOCATION '/user/username/input/orders_loc_new';

LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' INTO TABLE orders_loc_new;

-- Add a new column at the end of the schema.
ALTER TABLE orders_loc_new ADD COLUMNS (price int);

-- Change a column's type (name kept the same here).
ALTER TABLE orders_loc_new CHANGE price price float;

-- Convert the managed table into an external table.
ALTER TABLE orders_loc_new SET TBLPROPERTIES('EXTERNAL'='TRUE');


=====================================================


-- Overwriting the data already in a Hive table (replaces existing files).

LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' OVERWRITE INTO TABLE orders_loc_new;


======================================================


-- Partitioned tables.

-- Strict mode requires at least one STATIC partition key per insert;
-- switch to nonstrict so every partition key can be derived dynamically.
set hive.exec.dynamic.partition.mode=nonstrict;

create table part_orders(ord_id int, ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string);

-- Partition column(s) must be the LAST column(s) of the SELECT.
INSERT OVERWRITE TABLE part_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;

-- Full scan on the flat table vs. partition pruning on part_orders.
select * from orders where ord_status='COMPLETE';

select * from part_orders where ord_status='COMPLETE';

-- Dynamically creating a new partition: insert a row with a brand-new
-- status and re-run the insert (missing semicolon fixed).
insert into orders values (00034, "2014-07-23 00:00:00.0", 5333, "NPA");

INSERT OVERWRITE TABLE part_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;



-- Multilevel partitioned tables (one HDFS subdirectory level per key).


set hive.exec.dynamic.partition.mode=nonstrict;-- strict mode would require at least one static partition key


-- Partition first by ord_status, then by ord_id within each status.
create table mult_part_orders(ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string, ord_id int);


-- Partition columns go last in the SELECT, in PARTITION-clause order.
INSERT OVERWRITE TABLE mult_part_orders PARTITION(ord_status, ord_id)

SELECT  ord_date, ord_cust_id, ord_status,ord_id from  orders

where ord_id<=10;



=======================================================


-- Bucketing.


-- Make inserts produce exactly the declared number of bucket files.
-- NOTE(review): this property is always-on (removed) from Hive 2.x onward.
set hive.enforce.bucketing = true;


-- Hash-distribute rows into 3 buckets by ord_id within each partition.
create table bucket_orders (ord_id int, ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string)

clustered by (ord_id) into 3 buckets row format delimited fields terminated by ',';


INSERT OVERWRITE TABLE bucket_orders PARTITION(ord_status)

SELECT ord_id, ord_date, ord_cust_id, ord_status from  orders;


-- Partition pruning on ord_status plus bucket pruning on ord_id.
select * from bucket_orders where ord_status='COMPLETE'

and ord_id=1;


=====================================================


-- Managing output.

-- Write query results to a LOCAL directory (existing contents are replaced).

insert overwrite local directory '/home/username/output' row format delimited
fields terminated by '|' select * from orders;

-- Write query results to a directory in HDFS.

insert overwrite directory '/user/username/output/h_out' row format delimited
fields terminated by '#' select * from orders;


============================


-- To check the Hive query plan (also known as the abstract syntax tree / AST).


explain select * from orders;


===============================


-- Joins in Hive.

-- Inner join: only orders that have at least one matching order line.
select o.ord_cust_id, o.ord_id, o.ord_status,
       ol.ord_item_ord_id, ol.ord_item_qty, ol.ord_item_prd_prc, ol.ord_item_subttl
from orders o
join order_lines ol
  on o.ord_id = ol.ord_item_ord_id
limit 10;

-- Left outer join: every order, with NULL line columns when unmatched.
select o.ord_cust_id, o.ord_id, o.ord_status,
       ol.ord_item_ord_id, ol.ord_item_qty, ol.ord_item_prd_prc, ol.ord_item_subttl
from orders o
left outer join order_lines ol
  on o.ord_id = ol.ord_item_ord_id
limit 10;

-- Anti-join idiom: orders with NO order lines at all.
select o.ord_cust_id, o.ord_id, o.ord_status,
       ol.ord_item_ord_id, ol.ord_item_qty, ol.ord_item_prd_prc, ol.ord_item_subttl
from orders o
left outer join order_lines ol
  on o.ord_id = ol.ord_item_ord_id
where ol.ord_item_ord_id is NULL
limit 10;

-- Right outer join: every order line, even when its order is missing.
select o.ord_cust_id, o.ord_id, o.ord_status,
       ol.ord_item_ord_id, ol.ord_item_qty, ol.ord_item_prd_prc, ol.ord_item_subttl
from orders o
right outer join order_lines ol
  on o.ord_id = ol.ord_item_ord_id
limit 10;

-- Full outer join + IS NULL filter: rows unmatched on the order_lines side.
select o.ord_cust_id, o.ord_id, o.ord_status,
       ol.ord_item_ord_id, ol.ord_item_qty, ol.ord_item_prd_prc, ol.ord_item_subttl
from orders o
full outer join order_lines ol
  on o.ord_id = ol.ord_item_ord_id
where ol.ord_item_ord_id is NULL
limit 10;




===============================


-- Views in Hive: a stored query, expanded at read time.

CREATE VIEW completed_orders AS
SELECT *
FROM orders
WHERE ord_status = 'COMPLETE';



=======================================


-- Hive Beeline (JDBC thin client; preferred over the legacy hive CLI).

beeline

-- Connect to HiveServer2 over JDBC from within beeline:
!connect jdbc:hive2://hive.edu.cloudlab.com:10000


=======================================


-- Hive UDFs.

-- Register a jar containing user-defined functions with the session.
ADD jar /home/username/jars/HiveUdfs.jar;

-- Expose a class from the jar as a temporary SQL function.
create temporary function r_lower as 'myudfs.lcase';

-- List the functions available (built-in plus registered).
show functions;


======================================================



-- Indexes in Hive.
-- NOTE(review): index support was removed entirely in Hive 3.0;
-- materialized views or ORC/Parquet statistics + bloom filters replace it.


-- Baseline query before building any index.
select ord_item_ord_id, sum(ord_item_subttl)

from order_lines 

group by ord_item_ord_id;



-- Compact index: stores HDFS block locations per distinct key value.
CREATE INDEX order_lines_index

ON TABLE order_lines (ord_item_ord_id)

AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'

WITH DEFERRED REBUILD;


-- DEFERRED REBUILD means the index is empty until explicitly rebuilt.
Alter index order_lines_index on order_lines REBUILD;



-- Same query again -- the planner may now use the index.
select ord_item_ord_id, sum(ord_item_subttl)

from order_lines 

group by ord_item_ord_id;


-- Bitmap index: better suited to low-cardinality columns.
CREATE INDEX order_lines_index_bitmap

ON TABLE order_lines (ord_item_ord_id)

AS 'BITMAP'

WITH DEFERRED REBUILD;


ALTER INDEX order_lines_index_bitmap on order_lines REBUILD;


show index on order_lines;


DROP INDEX IF EXISTS order_lines_index_bitmap ON order_lines;


DROP INDEX IF EXISTS order_lines_index ON order_lines;


====================================================




-- Compression techniques in Hive.


-- Enable compressed query/job output for this session.
set hive.exec.compress.output = true;

set mapreduce.output.fileoutputformat.compress=true;

-- NOTE(review): mapred.output.compression.codec is the deprecated name;
-- the current property is mapreduce.output.fileoutputformat.compress.codec.
set mapred.output.compression.codec= org.apache.hadoop.io.compress.BZip2Codec;


-- Available output compression codecs:
--   4mc     com.hadoop.compression.fourmc.FourMcCodec
--   gzip    org.apache.hadoop.io.compress.GzipCodec
--   lzo     com.hadoop.compression.lzo.LzopCodec
--   snappy  org.apache.hadoop.io.compress.SnappyCodec
--   bzip2   org.apache.hadoop.io.compress.BZip2Codec
--   lz4     org.apache.hadoop.io.compress.Lz4Codec



-- Whole lines are loaded into a single string column; the delimiter is
-- '\n' (fixed from the '/n' typo), so each line is effectively one field.
create table orders_compress ( line String)
row format delimited
fields terminated by '\n';

LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' OVERWRITE INTO TABLE orders_compress;

-- The SELECT output is written compressed because of the settings above.
INSERT OVERWRITE DIRECTORY '/user/username/output/compressed_orders_data/' SELECT * FROM orders_compress;

-- Compare: uncompressed warehouse files vs. compressed output files.
dfs -ls /user/hive/warehouse/ecom_db.db/orders_compress;

dfs -ls /user/username/output/compressed_orders_data/;

-- Hive reads the compressed files transparently (codec inferred from the
-- file extension). Missing statement terminator fixed.
CREATE TABLE compress_orders_demo_load( line String)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\n' STORED AS TEXTFILE;

LOAD DATA INPATH '/user/username/output/compressed_orders_data/*' INTO TABLE compress_orders_demo_load;

select * from compress_orders_demo_load;



==========================================================


-- File formats in Hive.

-- 1. RC file (Record Columnar):

create table orders_rc (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
) stored as RCFILE;

-- INSERT ... SELECT rewrites the data into the RC format.
insert into orders_rc
select * from orders;

-- Point a second RC table at the first one's data directory: the files
-- there are already in RC format, so queries work.
CREATE TABLE orders_rc2 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
) stored as RCFILE
location '/user/hive/warehouse/ecom_db.db/orders_rc/';

CREATE TABLE orders_rc3 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
) stored as RCFILE;

-- LOAD DATA only moves files -- it does NOT convert formats.
-- Loading a file that is already RC-formatted works:
LOAD DATA local INPATH '/home/username/input/orders.rc' overwrite into table orders_rc3;

-- WARNING: this loads a plain CSV into an RCFILE table; the file is not
-- converted, so reads from orders_rc2 will fail or return garbage.
LOAD DATA local INPATH '/home/username/input/orders.csv' overwrite into table orders_rc2;


-- 2. Avro file format:

create table orders_avro (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
) stored as AVRO;

-- INSERT ... SELECT writes the rows out as Avro.
insert into orders_avro
select * from orders;

CREATE TABLE orders_avro2 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
) stored as AVRO;

-- The input file is already Avro, so a plain LOAD (file move) works here.
LOAD DATA local INPATH '/home/username/input/orders_avro_data.avro' into table orders_avro2;


-- 3. ORC file format (Optimized Row Columnar):

-- Column names aligned with the ord_* convention used for every other
-- orders table in this file (was Order_id, Order_date, ...).
create table orders_orc (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
stored as ORC;

-- INSERT ... SELECT maps columns positionally, so this works unchanged.
insert into orders_orc
select * from orders;


============================================================================


-- Hive string functions.


-- substr(str, start[, len]): 1-based start; a negative start counts
-- back from the end of the string.

select substr("hi there how are you", 2,5);

select substr("hi there how are you", 2);

select substr("hi there how are you", -7,5);

select cast(substr(ord_date,6,2) as int) from orders limit 10;


-- instr: 1-based position of the first occurrence (0 when absent).
select instr("hi there how are you", "there");




-- split: splits on a regex pattern and returns an array.
select split("hi there how are you" ,'how');


-- like matches the WHOLE string, so without wildcards this is false...
select "hi how are you" like 'are';
-- ...and with % wildcards it is true.
select "hi how are you" like '%are%';



-- lcase or lower
select lcase("HELLO THERE");


-- ucase or upper
select upper("hi there how are you");


-- trim: strips leading and trailing spaces.
select trim(' Hi there ');


-- lpad and rpad: pad to a fixed width, e.g. lpad(12,4,'0') -> '0012'.

select lpad(12,4,'0');

===========================================================


-- Date functions.

-- current_date
select current_date;

-- current_timestamp
select current_timestamp;

-- date_add: add N days.
select date_add(current_date,3);

-- datediff: difference in days, first minus second (-4 here).
select datediff(current_date,date_add(current_date,4));

-- date_format: Java SimpleDateFormat patterns.
select date_format(current_date,'y');

-- Fixed 'YYYYMM' -> 'yyyyMM': capital 'Y' is the WEEK-based year in
-- SimpleDateFormat and gives wrong results around year boundaries.
select date_format(ord_date,'yyyyMM') from orders limit 10;

-- day of month
select day(current_date);

-- to_date: strip the time portion from a timestamp.
select current_timestamp,to_date(current_timestamp);

-- to_unix_timestamp: seconds since the epoch.
select to_unix_timestamp(current_date);

-- from_unixtime [performance is better if dates are stored as epoch
-- numbers and converted back with this function only when displayed]
select from_unixtime(1599177634);

-- minute
select minute(current_timestamp);

-- month
select month(current_date);

-- months_between
select months_between(current_date,to_date('2021-09-21'));

-- next_day: the next Monday strictly after today.
select next_day(current_date,'MO');

===========================================================


-- Aggregation functions.

-- count: rows per status.
select ord_status, count(*) from orders
group by ord_status;

-- sum: revenue per order; HAVING filters on the aggregate alias.
select o.ord_id, sum(ol.ord_item_subttl) as ord_revenue
from orders o join order_lines ol
on o.ord_id = ol.ord_item_ord_id
group by o.ord_id
having ord_revenue>1000;

-- min (fixed the "orders_line" typo -- the table is order_lines).
select min(ord_item_subttl) from order_lines;

-- Sorting data: daily revenue over completed orders, ordered by date.
select o.ord_date, sum(ol.ord_item_subttl) as daily_ord_revenue
from orders o join order_lines ol
on o.ord_id = ol.ord_item_ord_id
where o.ord_status in ('CLOSED','COMPLETE')
group by o.ord_date
order by o.ord_date;



=======================================================


-- Special functions.

-- case, simple form: compares one expression against literal values.
select ord_id, ord_status, case ord_status
            when 'CLOSED' then 'OK'
            when 'COMPLETE' then 'OK'
            else 'NOT OK'
            end from orders;

-- case, searched form: each WHEN carries its own boolean condition
-- (fixed the 'COMPLTE' typo, which silently excluded completed orders).
select case
            when ord_status in ('CLOSED','COMPLETE') then 'OK'
            else 'NOT OK'
            end from orders limit 10;

-- nvl and subquery: substitute a default value for NULLs.
-- NOTE(review): ord_cust_id is an int, so the string default relies on
-- implicit coercion; a string column would make a cleaner demo.
select * from (
select nvl(ord_cust_id,'Missing status') as status from orders) as missing
where missing.status = 'Missing status';


-- End of notes.