Spark to Oracle
To query an Oracle table using Spark, you need to set up a JDBC connection to the Oracle database. Here’s a step-by-step approach:
Prerequisites:
- Oracle JDBC driver: ensure the Oracle JDBC driver (ojdbc8.jar) is available.
- Place it in a directory accessible to your Spark job, or pass it with --jars on the spark-submit command line.
Example Spark submit:
spark-submit --jars /path/to/ojdbc8.jar your_spark_job.jar
- Database Connection Details:
- JDBC URL format: jdbc:oracle:thin:@hostname:port:SID, or jdbc:oracle:thin:@//hostname:port/service_name when connecting by service name.
- Username and password stored securely using Hadoop credential providers or passed securely in properties.
Example: reading an Oracle table from Spark
// Query an Oracle table from Spark, resolving the DB credentials from a
// Hadoop JCEKS credential store instead of hard-coding them in source.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.alias.CredentialProviderFactory
import org.apache.spark.sql.SparkSession

// Create a SparkSession
val spark = SparkSession.builder()
  .appName("QueryOracleTable")
  .getOrCreate()

// Point Hadoop at the JCEKS credential store on HDFS.
val conf = new Configuration()
val jceksUri = "jceks://hdfs/path/to/creds.jceks"
conf.set(CredentialProviderFactory.CREDENTIAL_PROVIDER_PATH, jceksUri)

// Resolve a secret via the documented Configuration.getPassword API.
// Going through getProviders(...).get(0).getCredentialEntry(alias) NPEs when
// the alias is absent; getPassword searches all providers and lets us fail
// with a clear message instead.
def requiredSecret(alias: String): String = {
  val chars = conf.getPassword(alias)
  require(chars != null, s"Credential alias '$alias' not found in $jceksUri")
  new String(chars)
}
val dbUsername = requiredSecret("dbUsername")
val dbPassword = requiredSecret("dbPassword")

// Oracle connection properties.
// Service-name URLs use the '@//host:port/service' form; the '@host:port:sid'
// form is only for SIDs — mixing them is a common connection failure.
val jdbcUrl = "jdbc:oracle:thin:@//hostname:port/service_name"
val connectionProperties = new java.util.Properties()
connectionProperties.put("user", dbUsername)
connectionProperties.put("password", dbPassword)
// 'oracle.jdbc.driver.OracleDriver' is the legacy internal class; the
// supported public driver class is 'oracle.jdbc.OracleDriver'.
connectionProperties.put("driver", "oracle.jdbc.OracleDriver")

// Read the Oracle table into a DataFrame.
val oracleDF = spark.read
  .jdbc(jdbcUrl, "oracle_table_name", connectionProperties)

// Show the queried data
oracleDF.show()
Example: writing a Hive table to Oracle
// Copy rows from a Hive table into an Oracle table over JDBC.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("HiveToOracleExample")
  .enableHiveSupport() // required to resolve Hive-managed tables via spark.sql
  .getOrCreate()

// Read from Hive table
val hiveData = spark.sql("SELECT * FROM your_hive_table")

// Oracle connection properties.
// Service-name URLs use '@//host:port/service'; '@host:port:sid' is for SIDs.
val jdbcUrl = "jdbc:oracle:thin:@//hostname:port/service_name"
val connectionProperties = new java.util.Properties()
// SECURITY NOTE: avoid hard-coded credentials — prefer a Hadoop JCEKS
// credential store read via Configuration.getPassword, as in the read example.
connectionProperties.put("user", "your_username")
connectionProperties.put("password", "your_password")
// Public driver class; 'oracle.jdbc.driver.OracleDriver' is the legacy name.
connectionProperties.put("driver", "oracle.jdbc.OracleDriver")

// Write to Oracle table
hiveData.write
  .mode("append")              // add rows; use "overwrite" to replace the table
  .option("batchsize", "1000") // rows per JDBC batch insert round trip
  .option("numPartitions", "4") // max parallel JDBC connections used for the write
  .jdbc(jdbcUrl, "oracle_table_name", connectionProperties)