Important and useful Hive commands
--starting the hive CLI shell
hive
--listing databases
show databases;
--Running a hive script from a file
hive -f /home/username/jars/hive-script.sql
--[No matter which method is used, the Hive metastore is consulted for the metadata needed to connect to Hive tables]
--to run any shell command from hive, prefix it with !
!clear
--Running hdfs commands from the hive shell
dfs -ls /
--quitting the hive shell
quit;
==========================================================================
--Hive Databases
--when a database is created, a directory named <db_name>.db is created under /user/hive/warehouse/
create database ecom_db;
--to create a database only if it does not already exist
create database if not exists ecom_db;
--to show databases
show databases;
--set the current database
use ecom_db;
--to drop an empty database
drop database rishi_db;
--to drop a database together with the tables in it
drop database rishi_db cascade;
--Checking the hive configuration parameters
--From cloudera manager
--From backend at location /etc/hive/conf -- check hive.metastore.location parameter
--Location of this database in hadoop:
--  /user/hive/warehouse/ecom_db.db
==========================================================
--HIVE tables
--1. managed (internal) tables [the default table type]
--creating a managed table without a location clause
create table orders (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile;
create table order_lines(
ord_item_id int,
ord_item_ord_id int,
ord_item_prd_id int,
ord_item_qty TINYINT,
ord_item_subttl FLOAT,
ord_item_prd_prc FLOAT
)
row format delimited fields terminated by ','
stored as textfile;
--describe table (each form shows increasing detail)
describe orders;
describe extended orders;
describe formatted orders;
=========================================================
--[redacted during review: a stray credential-like string was committed here and has been removed]
--loading data into a hive table
--from a local (linux) path: the file is COPIED into the table's warehouse directory
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders;
LOAD DATA local INPATH '/home/username/input/order_lines.csv' into table order_lines;
--from an hdfs path: the file is MOVED (cut-paste) into the table's warehouse directory
LOAD DATA INPATH '/user/username/input/orders.csv' into table orders;
===========================================================
--creating a managed table with a location clause
--the table's data lives at the given HDFS path instead of under /user/hive/warehouse
create table orders_loc (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile
location '/user/username/input/orders_loc';
--the loaded file lands in the custom location above
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_loc;
============================================================
--2. external tables (Hive manages only the metadata, not the data files)
--creating an external table without a location clause
create external table orders_ext (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile;
--describe table (fixed typo: "descrive" -> "describe")
describe formatted orders_ext;
=====================================================
--loading data into the external table
--from local location: data gets copy pasted
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_ext;
--drop table - for an EXTERNAL table the underlying data doesn't get deleted (only the metadata is removed)
drop table orders_ext;
===================================================
--creating an external table with a location clause
--Hive reads/writes data at the given HDFS path; dropping the table leaves the files in place
create external table orders_ext_loc (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)
row format delimited fields terminated by ','
stored as textfile
location '/user/username/input/orders_ext_loc';
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_ext_loc;
====================================================
--altering tables
ALTER TABLE orders_loc RENAME TO orders_loc_new;
--point the table at a new HDFS directory (added the missing statement terminator)
ALTER TABLE orders_loc_new set location '/user/username/input/orders_loc_new';
LOAD DATA local INPATH '/home/username/input/orders.csv' into table orders_loc_new;
ALTER TABLE orders_loc_new ADD COLUMNS (price int);
--CHANGE old_name new_name new_type: here only the type changes
ALTER TABLE orders_loc_new CHANGE price price float;
--convert the managed table into an external one
alter table orders_loc_new SET TBLPROPERTIES('EXTERNAL'='TRUE');
=====================================================
--replacing (overwriting) the existing data in the hive table instead of appending
LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' OVERWRITE INTO TABLE orders_loc_new;
======================================================
--partitioned tables
--default mode is 'strict' (requires at least one static partition); 'nonstrict' allows fully dynamic partitioning
set hive.exec.dynamic.partition.mode=nonstrict;
create table part_orders(ord_id int, ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string);
--the partition column must come LAST in the select list
INSERT OVERWRITE TABLE part_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;
select * from orders where ord_status='COMPLETE';
select * from part_orders where ord_status='COMPLETE';
--dynamically creating partitions in the table (added the missing statement terminator)
insert into orders values (00034, "2014-07-23 00:00:00.0", 5333, "NPA");
INSERT OVERWRITE TABLE part_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;
--multilevel partitioned tables
set hive.exec.dynamic.partition.mode=nonstrict;
create table mult_part_orders(ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string, ord_id int);
INSERT OVERWRITE TABLE mult_part_orders PARTITION(ord_status, ord_id)
SELECT ord_date, ord_cust_id, ord_status,ord_id from orders
where ord_id<=10;
=======================================================
--bucketing
--NOTE(review): hive.enforce.bucketing is always-on in Hive 2.x+ — confirm it is needed for the target version
set hive.enforce.bucketing = true;
create table bucket_orders (ord_id int, ord_date string, ord_cust_id int) PARTITIONED BY(ord_status string)
clustered by (ord_id) into 3 buckets row format delimited fields terminated by ',';
--rows are hashed on ord_id into 3 bucket files per partition
INSERT OVERWRITE TABLE bucket_orders PARTITION(ord_status)
SELECT ord_id, ord_date, ord_cust_id, ord_status from orders;
--filtering on the clustering column can limit which bucket files are scanned
select * from bucket_orders where ord_status='COMPLETE'
and ord_id=1;
=====================================================
--managing output
--writing query results to a local directory (added the missing statement terminators)
insert overwrite local directory '/home/username/output' Row format delimited
fields terminated by '|' select * from orders;
--writing query results to a directory in HDFS
insert overwrite directory '/user/username/output/h_out' Row format delimited
fields terminated by '#' select * from orders;
============================
--to check the hive query plan, also known as the abstract syntax tree / execution plan
explain select * from orders;
===============================
--joins in hive
--inner join:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o join order_lines ol
on o.ord_id = ol.ord_item_ord_id limit 10;
--left outer join:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o left outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id limit 10;
--anti-join pattern: orders with NO matching order lines (right side is NULL)
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o left outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id
where ol.ord_item_ord_id is NULL
limit 10;
--right outer join:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o right outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id limit 10;
--full outer join, keeping only rows unmatched on the order_lines side:
select o.ord_cust_id, o.ord_id, o.ord_status,ol.ord_item_ord_id,ol.ord_item_qty,ol.ord_item_prd_prc,ol.ord_item_subttl from orders o full outer join order_lines ol
on o.ord_id = ol.ord_item_ord_id
where ol.ord_item_ord_id is NULL
limit 10;
===============================
--views in Hive
--a view is a stored query; no data is materialized
CREATE VIEW completed_orders AS
SELECT * FROM orders
Where ord_status='COMPLETE';
=======================================
--Hive Beeline (JDBC client for HiveServer2)
beeline
--connect to HiveServer2 over JDBC (default port 10000)
!connect jdbc:hive2://hive.edu.cloudlab.com:10000
=======================================
--Hive UDFs (user-defined functions packaged in a jar)
ADD jar /home/username/jars/HiveUdfs.jar;
--register the UDF class under a temporary (session-scoped) function name
create temporary function r_lower as 'myudfs.lcase';
--to see the various functions available
show functions;
======================================================
--indexes in Hive
--NOTE(review): Hive indexes were removed in Hive 3.0 — confirm the target Hive version still supports them
--baseline query (before building any index)
select ord_item_ord_id, sum(ord_item_subttl)
from order_lines
group by ord_item_ord_id;
--compact index; WITH DEFERRED REBUILD means it is empty until the REBUILD below
CREATE INDEX order_lines_index
ON TABLE order_lines (ord_item_ord_id)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
Alter index order_lines_index on order_lines REBUILD;
--same query again, now with the index available
select ord_item_ord_id, sum(ord_item_subttl)
from order_lines
group by ord_item_ord_id;
--bitmap index variant on the same column
CREATE INDEX order_lines_index_bitmap
ON TABLE order_lines (ord_item_ord_id)
AS 'BITMAP'
WITH DEFERRED REBUILD;
ALTER INDEX order_lines_index_bitmap on order_lines REBUILD;
show index on order_lines;
DROP INDEX IF EXISTS order_lines_index_bitmap ON order_lines;
DROP INDEX IF EXISTS order_lines_index ON order_lines;
====================================================
--compression techniques in Hive
set hive.exec.compress.output = true;
set mapreduce.output.fileoutputformat.compress=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec;
--available codec classes:
--  4mc     com.hadoop.compression.fourmc.FourMcCodec
--  gzip    org.apache.hadoop.io.compress.GzipCodec
--  lzo     com.hadoop.compression.lzo.LzopCodec
--  snappy  org.apache.hadoop.io.compress.SnappyCodec
--  bzip2   org.apache.hadoop.io.compress.BZip2Codec
--  lz4     org.apache.hadoop.io.compress.Lz4Codec
--single-column table; fixed the field terminator typo '/n' -> '\n' so each whole line is one field
create table orders_compress ( line String)
row format delimited
fields terminated by '\n';
LOAD DATA LOCAL INPATH '/home/username/input/orders.csv' OVERWRITE INTO TABLE orders_compress;
--output files are written with the configured compression codec
INSERT OVERWRITE DIRECTORY '/user/username/output/compressed_orders_data/' SELECT * FROM orders_compress;
dfs -ls /user/hive/warehouse/ecom_db.db/orders_compress
dfs -ls /user/username/output/compressed_orders_data/
--added the missing statement terminator after STORED AS TEXTFILE
CREATE TABLE compress_orders_demo_load( line String)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\n' STORED AS TEXTFILE;
--Hive decompresses the codec-compressed files transparently on read
LOAD DATA INPATH '/user/username/output/compressed_orders_data/*' INTO TABLE compress_orders_demo_load;
select * from compress_orders_demo_load;
==========================================================
--file formats in Hive
--1. RC file:
create table orders_rc(
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as RCFILE ;
--insert-select rewrites the rows into RC format
insert into orders_rc
select * from orders;
--points at orders_rc's data directory, so it reads the same RC data files
CREATE TABLE orders_rc2 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as RCFILE
location '/user/hive/warehouse/ecom_db.db/orders_rc/';
CREATE TABLE orders_rc3 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as RCFILE;
--LOAD DATA moves files as-is and does NOT convert formats; loading an RC file works
LOAD DATA local INPATH '/home/username/input/orders.rc' overwrite into table orders_rc3;
--WARNING: loading a plain csv into an RCFILE table does not convert it; subsequent reads will fail
LOAD DATA local INPATH '/home/username/input/orders.csv' overwrite into table orders_rc2;
--2. Avro file format:
create table orders_avro (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as AVRO ;
insert into orders_avro
select * from orders;
CREATE TABLE orders_avro2 (
ord_id int,
ord_date string,
ord_cust_id int,
ord_status string
)stored as AVRO;
LOAD DATA local INPATH '/home/username/input/orders_avro_data.avro' into table orders_avro2;
--3. ORC file format:
create table orders_orc (
Order_id int,
Order_date string,
Order_cust_id int,
Order_status string
)
Stored as ORC;
--insert-select maps columns by position, so the different column names here are fine
insert into orders_orc
select * from orders;
============================================================================
--Hive functions
--substr(str, start [, len]): 1-based start; a negative start counts back from the end
select substr("hi there how are you", 2,5);
select substr("hi there how are you", 2);
select substr("hi there how are you", -7,5);
--extract characters 6-7 of the date string and cast to int
select cast(substr(ord_date,6,2) as int) from orders limit 10;
--instr: 1-based position of the substring (0 when absent)
select instr("hi there how are you", "there");
--split: returns an array of the pieces around the delimiter
select split("hi there how are you" ,'how');
--like matches the WHOLE string, so without wildcards this returns false
select "hi how are you" like 'are';
select "hi how are you" like '%are%';
--lcase or lower
select lcase("HELLO THERE");
--ucase or upper
select upper("hi there how are you");
--trim: strips leading and trailing spaces
select trim(' Hi there ');
--lpad and rpad: pad the value to the given total length with the fill string
select lpad(12,4,'0');
===========================================================
--Date functions
--current_date
select current_date;
--Current_timestamp
select current_timestamp;
--Date_add: add N days to a date
select date_add(current_date,3);
--Date_diff: number of days between the two dates (negative here)
select datediff(current_date,date_add(current_date,4));
--Date_format (SimpleDateFormat patterns)
select date_format(current_date,'y');
--NOTE(review): 'YYYY' is the week-based year in SimpleDateFormat; 'yyyy' is probably intended — confirm
select date_format(ord_date,'YYYYMM') from orders limit 10;
--Day
select day(current_date);
--Todate: truncate a timestamp to its date part
select current_timestamp,to_date(current_timestamp);
--To_unix_timestamp
select to_unix_timestamp(current_date);
--From_unixtime [ performance is better if the date is converted into a number using this function]
select from_unixtime(1599177634);
--Minute
select minute(current_timestamp);
--Month
select month(current_date);
--Months_between
select months_between(current_date,to_date('2021-09-21'));
--Next_day: date of the next Monday
select next_day(current_date,'MO');
===========================================================
--aggregation functions
--count of orders per status
select ord_status, count(*) from orders
group by ord_status;
--sum: revenue per order; HAVING filters the aggregated groups
select o.ord_id,sum(ol.ord_item_subttl) ord_revenue
from orders o join order_lines ol
on o.ord_id= ol.ord_item_ord_id
group by o.ord_id
having ord_revenue>1000;
--min (fixed table name: order_lines, not orders_line)
select min(ord_item_subttl) from order_lines;
--sorting data: daily revenue for closed/complete orders
select o.ord_date, sum(ol.ord_item_subttl) daily_ord_revenue
from orders o join order_lines ol
on o.ord_id= ol.ord_item_ord_id
where o.ord_status in ('CLOSED','COMPLETE')
group by o.ord_date
order by o.ord_date;
=======================================================
--special functions
--case, simple form: compares ord_status against each WHEN value
select ord_id, ord_status, case ord_status
when 'CLOSED' then 'OK'
when 'COMPLETE' then 'OK'
else 'NOT OK'
end from orders;
--case, searched form (fixed typo: 'COMPLTE' -> 'COMPLETE')
select case
when ord_status in ('CLOSED','COMPLETE') then 'OK'
else 'NOT OK'
end from orders limit 10;
--nvl and subquery
--NOTE(review): nvl mixes an int column with a string default — confirm Hive coerces as intended
select * from (
select nvl(ord_cust_id,'Missing status') as status from orders) as missing
where missing.status='Missing status';
--end of notes