Spark to Oracle
To query an Oracle table using Spark, you need to set up a JDBC connection to the Oracle database. Here’s a step-by-step approach:
Prerequisites:
- Oracle JDBC driver: ensure the Oracle JDBC driver (ojdbc8.jar) is available.
- Place it in a directory accessible to your Spark job, or pass it with --jars on the spark-submit command line.
Example Spark submit:
spark-submit --jars /path/to/ojdbc8.jar your_spark_job.jar
- Database Connection Details:
- JDBC URL format: jdbc:oracle:thin:@hostname:port:SID, or jdbc:oracle:thin:@//hostname:port/service_name when connecting by service name.
- Username and password stored securely using Hadoop credential providers or passed securely in properties.
Example: reading an Oracle table from Spark
// Query an Oracle table from Spark, resolving the DB credentials from a
// Hadoop JCEKS credential store instead of hard-coding them in source.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.alias.CredentialProviderFactory
import org.apache.spark.sql.SparkSession

// Create a SparkSession
val spark = SparkSession.builder()
  .appName("QueryOracleTable")
  .getOrCreate()

// Point Hadoop at the JCEKS credential store on HDFS.
val conf = new Configuration()
val jceksUri = "jceks://hdfs/path/to/creds.jceks"
conf.set(CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH, jceksUri)

// Resolve a secret via the documented Configuration.getPassword API.
// Going through getProviders(...).get(0).getCredentialEntry(alias) NPEs when
// the alias is absent; getPassword searches all providers and lets us fail
// with a clear message instead.
def requiredSecret(alias: String): String = {
  val chars = conf.getPassword(alias)
  require(chars != null, s"Credential alias '$alias' not found in $jceksUri")
  new String(chars)
}
val dbUsername = requiredSecret("dbUsername")
val dbPassword = requiredSecret("dbPassword")

// Oracle connection properties.
// Service-name URLs use the '@//host:port/service' form; the '@host:port:sid'
// form is only for SIDs — mixing them is a common connection failure.
val jdbcUrl = "jdbc:oracle:thin:@//hostname:port/service_name"
val connectionProperties = new java.util.Properties()
connectionProperties.put("user", dbUsername)
connectionProperties.put("password", dbPassword)
// 'oracle.jdbc.driver.OracleDriver' is the legacy internal class; the
// supported public driver class is 'oracle.jdbc.OracleDriver'.
connectionProperties.put("driver", "oracle.jdbc.OracleDriver")

// Read the Oracle table into a DataFrame.
val oracleDF = spark.read
  .jdbc(jdbcUrl, "oracle_table_name", connectionProperties)

// Show the queried data
oracleDF.show()
Example: writing a Hive table to Oracle
// Copy rows from a Hive table into an Oracle table over JDBC.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("HiveToOracleExample")
  .enableHiveSupport() // required to resolve Hive-managed tables via spark.sql
  .getOrCreate()

// Read from Hive table
val hiveData = spark.sql("SELECT * FROM your_hive_table")

// Oracle connection properties.
// Service-name URLs use '@//host:port/service'; '@host:port:sid' is for SIDs.
val jdbcUrl = "jdbc:oracle:thin:@//hostname:port/service_name"
val connectionProperties = new java.util.Properties()
// SECURITY NOTE: avoid hard-coded credentials — prefer a Hadoop JCEKS
// credential store read via Configuration.getPassword, as in the read example.
connectionProperties.put("user", "your_username")
connectionProperties.put("password", "your_password")
// Public driver class; 'oracle.jdbc.driver.OracleDriver' is the legacy name.
connectionProperties.put("driver", "oracle.jdbc.OracleDriver")

// Write to Oracle table
hiveData.write
  .mode("append")              // add rows; use "overwrite" to replace the table
  .option("batchsize", "1000") // rows per JDBC batch insert round trip
  .option("numPartitions", "4") // max parallel JDBC connections used for the write
  .jdbc(jdbcUrl, "oracle_table_name", connectionProperties)