How to Handle Schema Changes/Evolution in Hive ORC Tables (e.g., Column Additions at the Source DB)

#Copy the Original AVSC File to the HDFS Schema Location
hadoop fs -cp /data/raw/oltp/cas/schema/current/CLAIM_CENTER.CC_ADDRESS.avsc /QA/giri/poc/schemachanges/cas/cc_address
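To double-check that the schema file landed where the later steps expect it, a quick listing and peek along these lines works (same HDFS path as above):

#Optional: verify the AVSC file is in place
hadoop fs -ls /QA/giri/poc/schemachanges/cas/cc_address
hadoop fs -cat /QA/giri/poc/schemachanges/cas/cc_address/CLAIM_CENTER.CC_ADDRESS.avsc | head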

#Copy the Original AVSC File to Local
hadoop fs -copyToLocal /data/raw/oltp/cas/schema/current/CLAIM_CENTER.CC_ADDRESS.avsc /opt/md/WIP/giri/poc/schemachanges/cas/cc_address
#Create an Avro Table with the Original Schema
CREATE EXTERNAL TABLE GRVTESTSCHEMA.CC_ADDRESS_AVRO
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///data/raw/oltp/cas/landing/CLAIM_CENTER.CC_ADDRESS/'
TBLPROPERTIES ('avro.schema.url'='hdfs:///QA/giri/poc/schemachanges/cas/cc_address/CLAIM_CENTER.CC_ADDRESS.avsc');

hive> select * from GRVTESTSCHEMA.CC_ADDRESS_AVRO limit 2;
OK
INSERT 2015-06-26 17:38:35.733405 202.0 0 3.0 2012-10-24:02:57:47.407000000 NULL NULL NULL NULL NULL 3.0 NULL 2012-10-24:02:57:47.407000000 435.0 NULL NULL 10001.0 NULL dad:7045353 NULL 0.0 NULL 0.0 NULL NULL 10005.0 NULL 4.0 10049.0 NULL NULL NULL NULL
INSERT 2015-06-26 17:38:35.733405 202.0 0 3.0 2012-10-24:02:57:47.407000000 NULL NULL NULL NULL NULL 3.0 NULL 2012-10-24:02:57:47.407000000 638.0 NULL NULL 10001.0 NULL dad:7045556 NULL 0.0 NULL 0.0 NULL NULL 10005.0 NULL 4.0 10024.0 NULL NULL NULL NULL
#Create the ORC Table with the Original Schema
CREATE EXTERNAL TABLE GRVTESTSCHEMA.CC_ADDRESS_ORC (
TXOPTYPE STRING, TXTIMESTAMP TIMESTAMP, LOADCOMMANDID DOUBLE, BATCHGEOCODE STRING, CREATEUSERID DOUBLE, ADDRESSLINE1 STRING, CREATETIME TIMESTAMP, ADDRESSLINE2 STRING, ADDRESSLINE3 STRING, CITY STRING, LATITUDE DOUBLE, LONGITUDE DOUBLE, UPDATEUSERID DOUBLE, POSTALCODEDENORM STRING, UPDATETIME TIMESTAMP, ID DOUBLE, HTMID DOUBLE, DESCRIPTION STRING, ADDRESSVERIFYSTATUS_AAA DOUBLE, VALIDUNTIL TIMESTAMP, PUBLICID STRING, ARCHIVEPARTITION DOUBLE, BEANVERSION DOUBLE, CITYDENORM STRING, RETIRED DOUBLE, COUNTY STRING, DPV_AAA STRING, ADDRESSTYPE DOUBLE, ADMIN STRING, GEOCODESTATUS DOUBLE, STATE DOUBLE, ADDRESSBOOKUID STRING, COUNTRY DOUBLE, POSTALCODE STRING, PHONE STRING)
PARTITIONED BY (load_year INT, load_month INT, load_date INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '01'
STORED AS ORC
LOCATION '/data/raw/oltp/cas/staging/CC_ADDRESS';

#Insert the Avro data into the ORC table
SET tez.queue.name=SIU;
SET hive.exec.dynamic.partition=true;
SET mapreduce.framework.name=yarn-tez;
SET hive.vectorized.execution.enabled=true;
SET hive.cbo.enable=true;
SET hive.exec.parallel=true;
SET hive.exec.mode.local.auto=true;
SET hive.merge.mapfiles=true;
SET hive.merge.mapredfiles=true;
SET hive.mapred.supports.subdirectories=true;
SET tez.runtime.intermediate-input.is-compressed=true;
SET tez.runtime.intermediate-output.should-compress=true;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.input.dir.recursive=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.execution.engine=tez;
SET hive.exec.max.dynamic.partitions=10000;
SET hive.exec.max.dynamic.partitions.pernode=100000;

INSERT OVERWRITE TABLE GRVTESTSCHEMA.CC_ADDRESS_ORC PARTITION (LOAD_YEAR,LOAD_MONTH,LOAD_DATE)
SELECT TXOPTYPE,TXTIMESTAMP,LOADCOMMANDID,BATCHGEOCODE,CREATEUSERID,ADDRESSLINE1,from_unixtime(unix_timestamp(CREATETIME,'yyyy-MM-dd:HH:mm:ss')),ADDRESSLINE2,ADDRESSLINE3,CITY,LATITUDE,LONGITUDE,UPDATEUSERID,POSTALCODEDENORM,from_unixtime(unix_timestamp(UPDATETIME,'yyyy-MM-dd:HH:mm:ss')),ID,HTMID,DESCRIPTION,ADDRESSVERIFYSTATUS_AAA,from_unixtime(unix_timestamp(VALIDUNTIL,'yyyy-MM-dd:HH:mm:ss')),PUBLICID,ARCHIVEPARTITION,BEANVERSION,CITYDENORM,RETIRED,COUNTY,DPV_AAA,ADDRESSTYPE,ADMIN,GEOCODESTATUS,STATE,ADDRESSBOOKUID,COUNTRY,POSTALCODE,PHONE,
YEAR(TO_DATE(txtimestamp)),MONTH(TO_DATE(txtimestamp)),DAY(TO_DATE(txtimestamp))
FROM GRVTESTSCHEMA.CC_ADDRESS_AVRO;
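As an optional sanity check (table and partition columns as defined above), you can confirm the dynamic partitions were created and roughly how many rows landed in each:

#Optional: verify the dynamic partitions and row counts after the initial load
SHOW PARTITIONS GRVTESTSCHEMA.CC_ADDRESS_ORC;
SELECT load_year, load_month, load_date, COUNT(*) AS row_cnt
FROM GRVTESTSCHEMA.CC_ADDRESS_ORC
GROUP BY load_year, load_month, load_date;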
#---------------------------------------- After Schema Changes ----------------------------------------
#Assume 5 additional columns are added to the Oracle cc_address table at the source:
#NEWCOL1 string, NEWCOL2 double, NEWCOL3 int, NEWCOL4 string, NEWCOL5 int
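For Avro schema evolution to work cleanly, the new fields in the modified AVSC should be declared as nullable unions with a null default, so the new reader schema can still read files written with the old schema. A minimal sketch of the field entries that would be appended in CLAIM_CENTER.CC_ADDRESS_modified.avsc (field names and types from the list above; the rest of the record definition stays as in the original file and is omitted here):

{"name": "NEWCOL1", "type": ["null", "string"], "default": null},
{"name": "NEWCOL2", "type": ["null", "double"], "default": null},
{"name": "NEWCOL3", "type": ["null", "int"], "default": null},
{"name": "NEWCOL4", "type": ["null", "string"], "default": null},
{"name": "NEWCOL5", "type": ["null", "int"], "default": null}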
#Copy the Modified AVSC File to the HDFS Schema Location
hadoop fs -copyFromLocal /opt/md/WIP/giri/LegacyDataAcquistition/poc/schemachanges/cas/cc_address/CLAIM_CENTER.CC_ADDRESS_modified.avsc /QA/giri/poc/schemachanges/cas/cc_address

#Drop the Original AVRO table
DROP TABLE GRVTESTSCHEMA.CC_ADDRESS_AVRO;

#Create an Avro Table with the Modified Schema
CREATE EXTERNAL TABLE GRVTESTSCHEMA.CC_ADDRESS_AVRO
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs:///data/raw/oltp/cas/landing/CLAIM_CENTER.CC_ADDRESS/'
TBLPROPERTIES ('avro.schema.url'='hdfs:///QA/giri/poc/schemachanges/cas/cc_address/CLAIM_CENTER.CC_ADDRESS_modified.avsc');

#See the New Schema Changes
desc formatted GRVTESTSCHEMA.CC_ADDRESS_AVRO;

#Select a few rows from the newly modified Avro table.
hive> select * from GRVTESTSCHEMA.CC_ADDRESS_AVRO limit 2;
OK
INSERT 2015-06-26 17:38:35.733405 202.0 0 3.0 2012-10-24:02:57:47.407000000 NULL NULL NULL NULL NULL NULL 3.0 NULL 2012-10-24:02:57:47.407000000 435.0 NULL NULL 10001.0 NULL dad:7045353 NULL 0.0 NULL 0.0 NULL NULL 10005.0 NULL 4.0 10049.0 NULL NULL NULL NULL NULL NULL NULL NULL NULL
INSERT 2015-06-26 17:38:35.733405 202.0 0 3.0 2012-10-24:02:57:47.407000000 NULL NULL NULL NULL NULL NULL 3.0 NULL 2012-10-24:02:57:47.407000000 638.0 NULL NULL 10001.0 NULL dad:7045556 NULL 0.0 NULL 0.0 NULL NULL 10005.0 NULL 4.0 10024.0 NULL NULL NULL NULL NULL NULL NULL NULL NULL
Time taken: 0.103 seconds, Fetched: 2 row(s)

#Alter the ORC Table to Add the New Columns
USE GRVTESTSCHEMA;
ALTER TABLE CC_ADDRESS_ORC ADD COLUMNS (newcol1 string, newcol2 double, newcol3 int, newcol4 string, newcol5 int);
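Optionally confirm the columns were appended; ALTER TABLE ... ADD COLUMNS places them at the end of the existing column list, just before the partition columns:

#Optional: verify the new columns on the ORC table
DESCRIBE GRVTESTSCHEMA.CC_ADDRESS_ORC;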

#Ingest the New Avro Data into the ORC Table
SET tez.queue.name=SIU;
SET hive.exec.dynamic.partition=true;
SET mapreduce.framework.name=yarn-tez;
SET hive.vectorized.execution.enabled=true;
SET hive.cbo.enable=true;
SET hive.exec.parallel=true;
SET hive.exec.mode.local.auto=true;
SET hive.merge.mapfiles=true;
SET hive.merge.mapredfiles=true;
SET hive.mapred.supports.subdirectories=true;
SET tez.runtime.intermediate-input.is-compressed=true;
SET tez.runtime.intermediate-output.should-compress=true;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.input.dir.recursive=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.execution.engine=tez;
SET hive.exec.max.dynamic.partitions=10000;
SET hive.exec.max.dynamic.partitions.pernode=100000;

INSERT OVERWRITE TABLE GRVTESTSCHEMA.CC_ADDRESS_ORC PARTITION (LOAD_YEAR,LOAD_MONTH,LOAD_DATE)
SELECT TXOPTYPE,TXTIMESTAMP,LOADCOMMANDID,BATCHGEOCODE,CREATEUSERID,ADDRESSLINE1,from_unixtime(unix_timestamp(CREATETIME,'yyyy-MM-dd:HH:mm:ss')),ADDRESSLINE2,ADDRESSLINE3,CITY,LATITUDE,LONGITUDE,UPDATEUSERID,POSTALCODEDENORM,from_unixtime(unix_timestamp(UPDATETIME,'yyyy-MM-dd:HH:mm:ss')),ID,HTMID,DESCRIPTION,ADDRESSVERIFYSTATUS_AAA,from_unixtime(unix_timestamp(VALIDUNTIL,'yyyy-MM-dd:HH:mm:ss')),PUBLICID,ARCHIVEPARTITION,BEANVERSION,CITYDENORM,RETIRED,COUNTY,DPV_AAA,ADDRESSTYPE,ADMIN,GEOCODESTATUS,STATE,ADDRESSBOOKUID,COUNTRY,POSTALCODE,PHONE,
newcol1,newcol2,newcol3,newcol4,newcol5,
YEAR(TO_DATE(txtimestamp)),MONTH(TO_DATE(txtimestamp)),DAY(TO_DATE(txtimestamp))
FROM GRVTESTSCHEMA.CC_ADDRESS_AVRO;

Existing records in the Hive ORC table will show NULL for all of the newly added columns, while new records arriving through Avro after the schema change will carry proper values for them.
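A quick spot check makes this visible per partition (illustrative; COUNT on a column only counts non-NULL values): partitions loaded before the schema change report zero populated values for the new columns, while partitions loaded afterwards report real counts.

#Spot check: old partitions show NULLs, newly loaded partitions show values
SELECT load_year, load_month, load_date,
       COUNT(*) AS total_rows,
       COUNT(newcol1) AS rows_with_newcol1
FROM GRVTESTSCHEMA.CC_ADDRESS_ORC
GROUP BY load_year, load_month, load_date;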
