Dynamic script to create and load an output Hive table (and a matching Oracle table) from an input Hive table

This script will:

  1. Dynamically determine the configuration file name based on the output Hive table name.
  2. Read the configuration file to get the columns.
  3. Extract the schema from the Hive table.
  4. Generate the Oracle and Hive table creation scripts.
  5. Generate a HiveQL file to insert data into the new Hive table based on the specified columns.

 

#!/bin/bash
#
# Generate Hive/Oracle CREATE TABLE scripts and a HiveQL load file for an
# output table, driven by a per-table configuration file named
# "<output_hive_table>.ini" (keys: columns, conditions, date_column).
#
# Usage: see the message below; all eight arguments are required.

# Check if necessary arguments are provided
if [ "$#" -ne 8 ]; then
  echo "Usage: $0 <input_hive_table> <output_hive_table> <output_oracle_table> <oracle_server> <oracle_remote_dir> <load_type> <where_value> <mode>"
  exit 1
fi

INPUT_TABLE="$1"          # Hive source, "db.table"
OUTPUT_HIVE_TABLE="$2"    # Hive target, "db.table"
OUTPUT_ORACLE_TABLE="$3"  # Oracle target, "schema.table"
ORACLE_SERVER="$4"        # host for scp/ssh
ORACLE_REMOTE_DIR="$5"    # remote directory for the generated SQL file
LOAD_TYPE="$6"            # "full" = INSERT OVERWRITE, anything else appends
WHERE_VALUE="$7"          # optional filter value for the WHERE clause
MODE="$8"                 # AutoSys job mode ("F" = run the generated HQL file)

# Extract database and table names for Hive and Oracle ("db.table" split).
HIVE_INPUT_DB=$(echo "$INPUT_TABLE" | cut -d '.' -f 1)
HIVE_INPUT_TBL=$(echo "$INPUT_TABLE" | cut -d '.' -f 2)
HIVE_OUTPUT_DB=$(echo "$OUTPUT_HIVE_TABLE" | cut -d '.' -f 1)
HIVE_OUTPUT_TBL=$(echo "$OUTPUT_HIVE_TABLE" | cut -d '.' -f 2)
ORACLE_OUTPUT_DB=$(echo "$OUTPUT_ORACLE_TABLE" | cut -d '.' -f 1)
ORACLE_OUTPUT_TBL=$(echo "$OUTPUT_ORACLE_TABLE" | cut -d '.' -f 2)

# Define the configuration file based on the output table name
CONFIG_FILE="${HIVE_OUTPUT_TBL}.ini"

# Check if the config file exists
if [ ! -f "$CONFIG_FILE" ]; then
  echo "Configuration file not found: $CONFIG_FILE" >&2
  exit 1
fi

# Read the columns, condition properties, and date column from the config
# file. "-f 2-" (instead of "-f 2") preserves values that themselves
# contain an '=' character.
COLUMNS=$(grep '^columns=' "$CONFIG_FILE" | cut -d '=' -f 2-)
CONDITIONS=$(grep '^conditions=' "$CONFIG_FILE" | cut -d '=' -f 2-)
DATE_COLUMN=$(grep '^date_column=' "$CONFIG_FILE" | cut -d '=' -f 2-)
IFS=',' read -r -a COLUMNS_ARRAY <<< "$COLUMNS"
IFS=',' read -r -a CONDITION_COLUMNS_ARRAY <<< "$CONDITIONS"

# Beeline command to describe the schema of the input table
BEELINE_DESC_CMD="USE $HIVE_INPUT_DB; DESCRIBE $HIVE_INPUT_TBL;"

# Describe the table, stripping beeline's ASCII-art borders and comments.
# NOTE(review): "<hive_server>:<port>/<database>" is a placeholder JDBC URL
# that must be substituted before this script can run.
# BUG FIX: the original tested $? after this assignment, which reflected the
# final `grep` of the pipeline rather than beeline; enabling pipefail inside
# the command substitution propagates a beeline failure to $?.
SCHEMA=$(set -o pipefail; beeline -u "jdbc:hive2://<hive_server>:<port>/<database>" -e "$BEELINE_DESC_CMD" | grep -Ev '^[+-]|^#|^[-]+$')

# Fail on a beeline/grep error or an empty schema.
if [ $? -ne 0 ] || [ -z "$SCHEMA" ]; then
  echo "Failed to describe the input table: $INPUT_TABLE" >&2
  exit 1
fi

# Function to convert Hive data types to Oracle data types
# Map a Hive column type to the closest Oracle type.
# Arguments: $1 - Hive type string (e.g. "varchar(100)", "decimal(10,2)")
# Outputs:   the Oracle type on stdout; unknown types fall back to
#            VARCHAR2(4000), matching the original behavior.
function convert_data_type {
  local hive_type="$1"
  case "$hive_type" in
    string) echo "VARCHAR2(4000)" ;;
    varchar\(*\)) echo "VARCHAR2${hive_type#varchar}" ;;   # keep the (n)
    char\(*\)) echo "CHAR${hive_type#char}" ;;             # keep the (n)
    tinyint|smallint|int) echo "NUMBER" ;;
    bigint) echo "NUMBER" ;;
    decimal\(*\)) echo "NUMBER${hive_type#decimal}" ;;     # keep the (p,s)
    decimal) echo "NUMBER" ;;
    double) echo "FLOAT" ;;
    float) echo "FLOAT" ;;
    boolean) echo "NUMBER(1)" ;;
    date) echo "DATE" ;;
    timestamp) echo "TIMESTAMP" ;;
    *) echo "VARCHAR2(4000)" ;;
  esac
}

# Function to check if a column is in the list of specified columns
# Return 0 (success) iff $1 appears in the global COLUMNS_ARRAY, else 1.
column_in_list() {
  local needle="$1" candidate
  for candidate in "${COLUMNS_ARRAY[@]}"; do
    [[ "$candidate" == "$needle" ]] && return 0
  done
  return 1
}

# Initialize the accumulators for the generated column lists.
COLUMN_NAMES=""
SELECT_COLUMNS=""

# Construct the CREATE TABLE statements
HIVE_CREATE_TABLE_CMD="CREATE TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL ("
ORACLE_CREATE_TABLE_CMD="CREATE TABLE $ORACLE_OUTPUT_DB.$ORACLE_OUTPUT_TBL ("

# Parse the DESCRIBE output and add columns to the CREATE TABLE statements
while read -r line; do
  # Skip empty lines and lines that start with #
  if [[ -z "$line" || "$line" =~ ^# ]]; then
    continue
  fi

  # NOTE(review): assumes '|'-delimited DESCRIBE output with the column name
  # in field 1 and the type in field 2 — verify against the actual beeline
  # output format (a leading '|' on each row would shift the fields by one).
  column_name=$(echo "$line" | cut -d '|' -f 1 | xargs)
  column_type=$(echo "$line" | cut -d '|' -f 2 | xargs)

  # Convert Hive data type to Oracle data type
  oracle_type=$(convert_data_type "$column_type")

  # BUG FIX: the original used `[[ ... || $(column_in_list ...) ]]`, which
  # substitutes the function's (empty) stdout and is therefore always false —
  # the column filter never matched. Invoke the function for its exit status.
  if [[ "$COLUMNS" == "ALL" ]] || column_in_list "$column_name"; then
    # Add column definition to both CREATE TABLE statements
    HIVE_CREATE_TABLE_CMD="$HIVE_CREATE_TABLE_CMD $column_name $column_type,"
    ORACLE_CREATE_TABLE_CMD="$ORACLE_CREATE_TABLE_CMD $column_name $oracle_type,"
    # Collect column names for the insert and select statements
    COLUMN_NAMES="$COLUMN_NAMES $column_name,"
    SELECT_COLUMNS="$SELECT_COLUMNS $column_name,"
  fi
done <<< "$SCHEMA"

# Remove the trailing comma and close the CREATE TABLE statements
HIVE_CREATE_TABLE_CMD="${HIVE_CREATE_TABLE_CMD%,} );"
ORACLE_CREATE_TABLE_CMD="${ORACLE_CREATE_TABLE_CMD%,} );"

# Remove the trailing comma for column names
COLUMN_NAMES="${COLUMN_NAMES%,}"
SELECT_COLUMNS="${SELECT_COLUMNS%,}"

# Build the HiveQL load statement: a "full" load overwrites the target table,
# anything else appends to it.
case "$LOAD_TYPE" in
  full) LOAD_CMD="INSERT OVERWRITE TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL SELECT $SELECT_COLUMNS FROM $HIVE_INPUT_DB.$HIVE_INPUT_TBL" ;;
  *)    LOAD_CMD="INSERT INTO TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL SELECT $SELECT_COLUMNS FROM $HIVE_INPUT_DB.$HIVE_INPUT_TBL" ;;
esac

# Append a WHERE clause when a filter value was supplied: append loads filter
# on the configured date column, other loads AND together every condition
# column from the config file.
if [[ -n "$WHERE_VALUE" ]]; then
  where_sql=" WHERE "
  if [[ "$LOAD_TYPE" == "append" ]]; then
    where_sql="$where_sql $DATE_COLUMN = '$WHERE_VALUE'"
  else
    for cond_col in "${CONDITION_COLUMNS_ARRAY[@]}"; do
      where_sql="$where_sql $cond_col = '$WHERE_VALUE' AND"
    done
    # Drop the trailing "AND"
    where_sql="${where_sql%AND}"
  fi
  LOAD_CMD="$LOAD_CMD $where_sql"
fi
LOAD_CMD="$LOAD_CMD;"

# Write the HiveQL insert statement for the new Hive table.
INSERT_SQL_FILE="/tmp/insert_${HIVE_OUTPUT_TBL}.hql"
printf '%s\n' "$LOAD_CMD" > "$INSERT_SQL_FILE"

# Write the Oracle CREATE TABLE statement to a SQL file.
SQL_FILE="/tmp/create_${ORACLE_OUTPUT_TBL}.sql"
printf '%s\n' "$ORACLE_CREATE_TABLE_CMD" > "$SQL_FILE"

# Execute the CREATE TABLE statement for Hive using Beeline
# NOTE(review): "<hive_server>:<port>/<database>" is a placeholder JDBC URL —
# it must be substituted with real connection details before running.
beeline -u "jdbc:hive2://<hive_server>:<port>/<database>" -e "USE $HIVE_OUTPUT_DB; $HIVE_CREATE_TABLE_CMD"

# Check if the create table command was successful
if [ $? -ne 0 ]; then
  echo "Failed to create the new Hive table: $OUTPUT_HIVE_TABLE"
  exit 1
fi

echo "Hive table $HIVE_OUTPUT_TBL created successfully in the database $HIVE_OUTPUT_DB."

# Transfer the Oracle CREATE TABLE SQL file to the remote server
# (relies on passwordless SSH/SCP being configured for $ORACLE_SERVER).
scp "$SQL_FILE" "$ORACLE_SERVER:$ORACLE_REMOTE_DIR"

# Check if the scp command was successful
if [ $? -ne 0 ]; then
  echo "Failed to transfer the Oracle CREATE TABLE SQL file to the remote server"
  exit 1
fi

# Run the SQL file on the remote Oracle server using sqlplus
# SECURITY NOTE(review): "your_username/your_password" is a hard-coded
# placeholder credential on the command line; it would be visible in `ps`
# output and shell history on the remote host. Replace with an Oracle wallet
# ("/@tns_alias") or a secret-store lookup before production use.
ssh "$ORACLE_SERVER" "cd $ORACLE_REMOTE_DIR; sqlplus your_username/your_password @$(basename $SQL_FILE)"

# Check if the sqlplus command was successful
# NOTE(review): sqlplus frequently exits 0 even when the SQL fails; consider
# adding WHENEVER SQLERROR EXIT FAILURE to the SQL file — confirm.
if [ $? -ne 0 ]; then
  echo "Failed to create the new Oracle table: $OUTPUT_ORACLE_TABLE"
  exit 1
fi

echo "Oracle table $ORACLE_OUTPUT_TBL created successfully in the Oracle database $ORACLE_OUTPUT_DB."

echo "HiveQL file for loading data into the new Hive table is created: $INSERT_SQL_FILE"

# Define the AutoSys job name based on the output table name
JOB_NAME="hive_load_${HIVE_OUTPUT_TBL}"

# Paths for job logs and the generated JIL definition.
LOG_DIR="/path/to/logs"   # NOTE(review): placeholder — confirm the real log directory
AUTOSYS_JIL_FILE="/tmp/${JOB_NAME}.jil"

# Build the command the job will run: mode "F" replays the generated HQL
# file; any other mode runs an inline incremental INSERT filtered on the
# configured date column.
if [ "$MODE" == "F" ]; then
  COMMAND="beeline -u 'jdbc:hive2://<hive_server>:<port>/<database>' -f '$INSERT_SQL_FILE'"
else
  # BUG FIX: the date value was previously emitted unquoted in the generated
  # SQL (WHERE col = 2024-01-01 parses as arithmetic in HiveQL); wrap it in
  # escaped double quotes, which HiveQL accepts for string literals.
  COMMAND="beeline -u 'jdbc:hive2://<hive_server>:<port>/<database>' -e 'INSERT INTO TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL SELECT $SELECT_COLUMNS FROM $HIVE_INPUT_DB.$HIVE_INPUT_TBL WHERE $DATE_COLUMN = \"$WHERE_VALUE\";'"
fi

# Write the AutoSys JIL file.
# NOTE(review): <machine_name>/<owner_name> are literal placeholders here —
# substitute real values (the second script version reads them from
# job_config.ini).
cat <<EOL > "$AUTOSYS_JIL_FILE"
insert_job: $JOB_NAME
job_type: c
command: $COMMAND
machine: <machine_name>
owner: <owner_name>
permission: gx,ge
date_conditions: 1
days_of_week: all
start_times: "00:00"
description: "AutoSys job to load data into Hive table $HIVE_OUTPUT_TBL"
std_out_file: $LOG_DIR/$JOB_NAME.out
std_err_file: $LOG_DIR/$JOB_NAME.err
alarm_if_fail: 1
EOL

# Print a message indicating the job has been created
echo "AutoSys job definition file created: $AUTOSYS_JIL_FILE"

 

# Configuration for Hive and Oracle Table Creation

# List of columns to be included. Use "ALL" to include all columns.
columns=column1,column2,column3

# The condition columns to be used in the WHERE clause for full data load.
conditions=condition_column1,condition_column2

# The date column to be used in the WHERE clause for append data load.
date_column=date_column_name

# The partition column for the Hive table (read by the partition-aware version
# of the script; omit or leave empty for non-partitioned tables).
partition_column=partition_column_name

 

#!/bin/bash
#
# Partition-aware variant: generate Hive/Oracle CREATE TABLE scripts, a
# HiveQL load file and an AutoSys job for an output table, driven by
# "<output_hive_table>.ini" (columns/conditions/date_column/partition_column)
# plus a shared "job_config.ini" for AutoSys machine/owner.

# Check if necessary arguments are provided
if [ "$#" -ne 8 ]; then
  echo "Usage: $0 <input_hive_table> <output_hive_table> <output_oracle_table> <oracle_server> <oracle_remote_dir> <load_type> <where_value> <mode>"
  exit 1
fi

INPUT_TABLE="$1"          # Hive source, "db.table"
OUTPUT_HIVE_TABLE="$2"    # Hive target, "db.table"
OUTPUT_ORACLE_TABLE="$3"  # Oracle target, "schema.table"
ORACLE_SERVER="$4"        # host for scp/ssh
ORACLE_REMOTE_DIR="$5"    # remote directory for the generated SQL file
LOAD_TYPE="$6"            # "F" = INSERT OVERWRITE, otherwise append
WHERE_VALUE="$7"          # optional filter value for the WHERE clause
MODE="$8"                 # AutoSys job mode

# Extract database and table names for Hive and Oracle ("db.table" split).
HIVE_INPUT_DB=$(echo "$INPUT_TABLE" | cut -d '.' -f 1)
HIVE_INPUT_TBL=$(echo "$INPUT_TABLE" | cut -d '.' -f 2)
HIVE_OUTPUT_DB=$(echo "$OUTPUT_HIVE_TABLE" | cut -d '.' -f 1)
HIVE_OUTPUT_TBL=$(echo "$OUTPUT_HIVE_TABLE" | cut -d '.' -f 2)
ORACLE_OUTPUT_DB=$(echo "$OUTPUT_ORACLE_TABLE" | cut -d '.' -f 1)
ORACLE_OUTPUT_TBL=$(echo "$OUTPUT_ORACLE_TABLE" | cut -d '.' -f 2)

# Define the configuration files based on the output table name and for the AutoSys job
CONFIG_FILE="${HIVE_OUTPUT_TBL}.ini"
AUTOSYS_CONFIG_FILE="job_config.ini"

# Check if the config files exist
if [ ! -f "$CONFIG_FILE" ]; then
  echo "Configuration file not found: $CONFIG_FILE" >&2
  exit 1
fi

if [ ! -f "$AUTOSYS_CONFIG_FILE" ]; then
  echo "AutoSys configuration file not found: $AUTOSYS_CONFIG_FILE" >&2
  exit 1
fi

# Read the columns, condition properties, date column, and partition column.
# "-f 2-" (instead of "-f 2") preserves values that themselves contain '='.
COLUMNS=$(grep '^columns=' "$CONFIG_FILE" | cut -d '=' -f 2-)
CONDITIONS=$(grep '^conditions=' "$CONFIG_FILE" | cut -d '=' -f 2-)
DATE_COLUMN=$(grep '^date_column=' "$CONFIG_FILE" | cut -d '=' -f 2-)
PARTITION_COLUMN=$(grep '^partition_column=' "$CONFIG_FILE" | cut -d '=' -f 2-)
IFS=',' read -r -a COLUMNS_ARRAY <<< "$COLUMNS"
IFS=',' read -r -a CONDITION_COLUMNS_ARRAY <<< "$CONDITIONS"

# Read machine name and owner from AutoSys config file
# NOTE(review): this grep/grep/tail chain implies a very specific layout of
# job_config.ini (a 'machine'/'owner' marker near 'name=...' lines); verify
# against the real file — simple 'machine=' / 'owner=' keys would be far more
# robust than this pipeline.
MACHINE_NAME=$(grep '^name=' "$AUTOSYS_CONFIG_FILE" -A1 | grep 'machine' -A1 | tail -n1 | cut -d '=' -f2)
OWNER_NAME=$(grep '^name=' "$AUTOSYS_CONFIG_FILE" -A1 | grep 'owner' -A1 | tail -n1 | cut -d '=' -f2)

# Beeline command to describe the schema of the input table
BEELINE_DESC_CMD="USE $HIVE_INPUT_DB; DESCRIBE $HIVE_INPUT_TBL;"

# Describe the table, stripping beeline's ASCII-art borders and comments.
# BUG FIX: the original tested $? after this assignment, which reflected the
# trailing `grep`, not beeline; pipefail inside the command substitution
# propagates a beeline failure, and an empty schema is also treated as failure.
SCHEMA=$(set -o pipefail; beeline -u "jdbc:hive2://<hive_server>:<port>/<database>" -e "$BEELINE_DESC_CMD" | grep -Ev '^[+-]|^#|^[-]+$')

if [ $? -ne 0 ] || [ -z "$SCHEMA" ]; then
  echo "Failed to describe the input table: $INPUT_TABLE" >&2
  exit 1
fi

# Function to convert Hive data types to Oracle data types
# Map a Hive column type to the closest Oracle type.
# Arguments: $1 - Hive type string (e.g. "varchar(100)", "decimal(10,2)")
# Outputs:   the Oracle type on stdout; unknown types fall back to
#            VARCHAR2(4000), matching the original behavior.
function convert_data_type {
  local hive_type="$1"
  case "$hive_type" in
    string) echo "VARCHAR2(4000)" ;;
    varchar\(*\)) echo "VARCHAR2${hive_type#varchar}" ;;   # keep the (n)
    char\(*\)) echo "CHAR${hive_type#char}" ;;             # keep the (n)
    tinyint|smallint|int) echo "NUMBER" ;;
    bigint) echo "NUMBER" ;;
    decimal\(*\)) echo "NUMBER${hive_type#decimal}" ;;     # keep the (p,s)
    decimal) echo "NUMBER" ;;
    double) echo "FLOAT" ;;
    float) echo "FLOAT" ;;
    boolean) echo "NUMBER(1)" ;;
    date) echo "DATE" ;;
    timestamp) echo "TIMESTAMP" ;;
    *) echo "VARCHAR2(4000)" ;;
  esac
}

# Function to check if a column is in the list of specified columns
# Return 0 (success) iff $1 appears in the global COLUMNS_ARRAY, else 1.
column_in_list() {
  local needle="$1" candidate
  for candidate in "${COLUMNS_ARRAY[@]}"; do
    [[ "$candidate" == "$needle" ]] && return 0
  done
  return 1
}

# Initialize the accumulators for the generated column lists.
COLUMN_NAMES=""
SELECT_COLUMNS=""
PARTITION_COLUMN_DEF=""

# Construct the CREATE TABLE statements
HIVE_CREATE_TABLE_CMD="CREATE TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL ("
ORACLE_CREATE_TABLE_CMD="CREATE TABLE $ORACLE_OUTPUT_DB.$ORACLE_OUTPUT_TBL ("

# Parse the DESCRIBE output and add columns to the CREATE TABLE statements
while read -r line; do
  # Skip empty lines and lines that start with #
  if [[ -z "$line" || "$line" =~ ^# ]]; then
    continue
  fi

  # NOTE(review): assumes '|'-delimited DESCRIBE output with the column name
  # in field 1 and the type in field 2 — verify against the actual beeline
  # output format (a leading '|' on each row would shift the fields by one).
  column_name=$(echo "$line" | cut -d '|' -f 1 | xargs)
  column_type=$(echo "$line" | cut -d '|' -f 2 | xargs)

  # Convert Hive data type to Oracle data type
  oracle_type=$(convert_data_type "$column_type")

  # BUG FIX: the original used `[[ ... || $(column_in_list ...) ]]`, which
  # substitutes the function's (empty) stdout and is therefore always false —
  # the column filter never matched. Invoke the function for its exit status.
  if [[ "$COLUMNS" == "ALL" ]] || column_in_list "$column_name"; then
    # The Hive partition column belongs in PARTITIONED BY, not the regular
    # column list; Oracle has no such split, so it stays a normal column there.
    if [[ "$PARTITION_COLUMN" == "$column_name" ]]; then
      PARTITION_COLUMN_DEF="$column_name $column_type"
    else
      HIVE_CREATE_TABLE_CMD="$HIVE_CREATE_TABLE_CMD $column_name $column_type,"
    fi
    ORACLE_CREATE_TABLE_CMD="$ORACLE_CREATE_TABLE_CMD $column_name $oracle_type,"
    # NOTE(review): the partition column is also appended to SELECT_COLUMNS;
    # a Hive dynamic-partition INSERT requires it LAST in the SELECT list —
    # confirm the resulting column order against the load statement below.
    COLUMN_NAMES="$COLUMN_NAMES $column_name,"
    SELECT_COLUMNS="$SELECT_COLUMNS $column_name,"
  fi
done <<< "$SCHEMA"

# Close the Hive CREATE TABLE, adding PARTITIONED BY when a partition column
# was found among the included columns.
HIVE_CREATE_TABLE_CMD="${HIVE_CREATE_TABLE_CMD%,} )"
if [[ -n "$PARTITION_COLUMN_DEF" ]]; then
  HIVE_CREATE_TABLE_CMD="$HIVE_CREATE_TABLE_CMD PARTITIONED BY ($PARTITION_COLUMN_DEF);"
else
  HIVE_CREATE_TABLE_CMD="$HIVE_CREATE_TABLE_CMD;"
fi
ORACLE_CREATE_TABLE_CMD="${ORACLE_CREATE_TABLE_CMD%,} );"

# Remove the trailing comma for column names
COLUMN_NAMES="${COLUMN_NAMES%,}"
SELECT_COLUMNS="${SELECT_COLUMNS%,}"

# Build the HiveQL load statement: load type "F" (full) overwrites the target
# table, anything else appends to it.
case "$LOAD_TYPE" in
  F) LOAD_CMD="INSERT OVERWRITE TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL SELECT $SELECT_COLUMNS FROM $HIVE_INPUT_DB.$HIVE_INPUT_TBL" ;;
  *) LOAD_CMD="INSERT INTO TABLE $HIVE_OUTPUT_DB.$HIVE_OUTPUT_TBL SELECT $SELECT_COLUMNS FROM $HIVE_INPUT_DB.$HIVE_INPUT_TBL" ;;
esac

# Append a WHERE clause when a filter value was supplied: incremental ("I")
# loads filter on the configured date column, other loads AND together every
# condition column from the config file.
if [[ -n "$WHERE_VALUE" ]]; then
  where_sql=" WHERE "
  if [[ "$LOAD_TYPE" == "I" ]]; then
    where_sql="$where_sql $DATE_COLUMN = '$WHERE_VALUE'"
  else
    for cond_col in "${CONDITION_COLUMNS_ARRAY[@]}"; do
      where_sql="$where_sql $cond_col = '$WHERE_VALUE' AND"
    done
    # Drop the trailing "AND"
    where_sql="${where_sql%AND}"
  fi
  LOAD_CMD="$LOAD_CMD $where_sql"
fi
LOAD_CMD="$LOAD_CMD;"

# Write the HiveQL insert statement for the new Hive table.
INSERT_SQL_FILE="/tmp/insert_${HIVE_OUTPUT_TBL}.hql"
printf '%s\n' "$LOAD_CMD" > "$INSERT_SQL_FILE"

# Write the Oracle CREATE TABLE statement to a SQL file.
SQL_FILE="/tmp/create_${ORACLE_OUTPUT_TBL}.sql"
printf '%s\n' "$ORACLE_CREATE_TABLE_CMD" > "$SQL_FILE"

# Append the GRANT statement giving the "read" role SELECT on the new table.
printf '%s\n' "GRANT SELECT ON ${ORACLE_OUTPUT_DB}.${ORACLE_OUTPUT_TBL} TO read;" >> "$SQL_FILE"

# Execute the CREATE TABLE statement for Hive using Beeline
# NOTE(review): "<hive_server>:<port>/<database>" is a placeholder JDBC URL —
# it must be substituted with real connection details before running.
beeline -u "jdbc:hive2://<hive_server>:<port>/<database>" -e "USE $HIVE_OUTPUT_DB; $HIVE_CREATE_TABLE_CMD"

# Check if the create table command was successful
if [ $? -ne 0 ]; then
  echo "Failed to create the new Hive table: $OUTPUT_HIVE_TABLE"
  exit 1
fi

echo "Hive table $HIVE_OUTPUT_TBL created successfully in the database $HIVE_OUTPUT_DB."

# Transfer the Oracle CREATE TABLE SQL file to the remote server
# (relies on passwordless SSH/SCP being configured for $ORACLE_SERVER).
scp "$SQL_FILE" "$ORACLE_SERVER:$ORACLE_REMOTE_DIR"

# Check if the scp command was successful
if [ $? -ne 0 ]; then
  echo "Failed to transfer the Oracle CREATE TABLE SQL file to the remote server"
  exit 1
fi

#