[me@myhost ~]$ spark2-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
20/02/02 06:08:36 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Spark context Web UI available at http://myhost:4041
Spark context available as 'sc' (master = yarn, app id = application_1570004862164_137877).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera4
      /_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_141)
Type in expressions to have them evaluated.
Type :help for more information.
scala> case class MyClass(mystring: String, myboolean: Boolean, myint: Int, myLong: Long, myfloat: Float, mydouble: Double)
defined class MyClass
scala> val df = Seq(MyClass("a", true, 1, 1L, 1/7f, 1/7f), MyClass("b", false, 2, 2L, 2.0f, 2.0d)).toDF
df: org.apache.spark.sql.DataFrame = [mystring: string, myboolean: boolean ... 4 more fields]
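The shell pre-imports spark.implicits._, which is what makes .toDF available on the Seq. In a standalone application the same code needs the import spelled out; a minimal sketch (the object name is illustrative):

import org.apache.spark.sql.SparkSession

// Top-level case class so Spark can derive an encoder for it.
case class MyClass(mystring: String, myboolean: Boolean, myint: Int,
                   myLong: Long, myfloat: Float, mydouble: Double)

object MyClassDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("MyClassDemo").getOrCreate()
    import spark.implicits._ // brings .toDF into scope, as the shell does

    val df = Seq(MyClass("a", true, 1, 1L, 1 / 7f, 1 / 7f),
                 MyClass("b", false, 2, 2L, 2.0f, 2.0d)).toDF
    df.printSchema()
    spark.stop()
  }
}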
scala> import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SaveMode
scala> df.write.mode(SaveMode.Overwrite).format("parquet").save("myclass_parquet")
scala> df.write.mode(SaveMode.Overwrite).parquet("myclass_parquet")
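.parquet(path) is shorthand for .format("parquet").save(path), so the two writes above produce identical output. The writer also takes a compression option when the default snappy codec is not wanted; a small sketch with a hypothetical output path:

// Same write, explicit codec; "myclass_parquet_gzip" is a hypothetical path.
df.write.mode(SaveMode.Overwrite)
  .option("compression", "gzip") // default is snappy, as the file names below show
  .parquet("myclass_parquet_gzip")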
scala> df.write.mode(SaveMode.Overwrite).format("avro").save("myclass_avro")
org.apache.spark.sql.AnalysisException: Failed to find data source: avro. Please find an Avro package at http://spark.apache.org/third-party-projects.html;
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:241)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225)
... 49 elided
scala> df.write.mode(SaveMode.Overwrite).avro("myclass_avro")
<console>:27: error: value avro is not a member of org.apache.spark.sql.DataFrameWriter[org.apache.spark.sql.Row]
       df.write.mode(SaveMode.Overwrite).avro("myclass_avro")
                                         ^
scala> import com.databricks.spark.avro._
<console>:24: error: object databricks is not a member of package com
       import com.databricks.spark.avro._
                  ^
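Stock Spark 2.3 ships no Avro data source, hence all three failures: the short name avro is not registered, DataFrameWriter has no avro method, and com.databricks.spark.avro is not on the classpath. The next commands fix this by downloading the spark-avro jar and passing it with --jars; asking spark2-shell to resolve it from Maven should work equally well (an untested alternative to the approach below):

spark2-shell --packages com.databricks:spark-avro_2.11:4.0.0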
[me@myhost ~]$ wget https://repo1.maven.org/maven2/com/databricks/spark-avro_2.11/4.0.0/spark-avro_2.11-4.0.0.jar
[me@myhost ~]$ spark2-shell --jars spark-avro_2.11-4.0.0.jar
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://myhost:4040
Spark context available as 'sc' (master = yarn, app id = application_1570004862164_137923).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera4
      /_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_141)
Type in expressions to have them evaluated.
Type :help for more information.
scala> case class MyClass(mystring: String, myboolean: Boolean, myint: Int, myLong: Long, myfloat: Float, mydouble: Double)
defined class MyClass
scala> val df = Seq(MyClass("a", true, 1, 1L, 1/7f, 1/7f), MyClass("b", false, 2, 2L, 2.0f, 2.0d)).toDF
df: org.apache.spark.sql.DataFrame = [mystring: string, myboolean: boolean ... 4 more fields]
scala>
scala> import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SaveMode
scala> df.write.mode(SaveMode.Overwrite).format("avro").save("myclass_avro")
org.apache.spark.sql.AnalysisException: Failed to find data source: avro. Please find an Avro package at http://spark.apache.org/third-party-projects.html;
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:241)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225)
... 49 elided
scala> df.write.mode(SaveMode.Overwrite).format("com.databricks.spark.avro").save("myclass_avro")
scala> df.write.mode(SaveMode.Overwrite).avro("myclass_avro")
<console>:27: error: value avro is not a member of org.apache.spark.sql.DataFrameWriter[org.apache.spark.sql.Row]
       df.write.mode(SaveMode.Overwrite).avro("myclass_avro")
                                         ^
scala> import com.databricks.spark.avro._
import com.databricks.spark.avro._
scala> df.write.mode(SaveMode.Overwrite).avro("myclass_avro")
scala>
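The import is what makes the .avro shorthand compile: the com.databricks.spark.avro package object enriches DataFrameWriter and DataFrameReader with implicit classes that delegate to the fully qualified format name. A simplified sketch (not the library's exact source):

import org.apache.spark.sql.{DataFrame, DataFrameReader, DataFrameWriter}

// Bring these into scope (import AvroSyntaxSketch._) to get the shorthand.
object AvroSyntaxSketch {
  implicit class AvroWriter[T](writer: DataFrameWriter[T]) {
    // df.write.avro(path) == df.write.format("com.databricks.spark.avro").save(path)
    def avro(path: String): Unit =
      writer.format("com.databricks.spark.avro").save(path)
  }
  implicit class AvroReader(reader: DataFrameReader) {
    // spark.read.avro(path) == spark.read.format("com.databricks.spark.avro").load(path)
    def avro(path: String): DataFrame =
      reader.format("com.databricks.spark.avro").load(path)
  }
}

This also explains the pattern above: format("com.databricks.spark.avro") works without any import, while the short name avro evidently is not registered by this build of the package, so format("avro") keeps failing even with the jar loaded.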
[me@myhost ~]$ hdfs dfs -ls myclass_parquet
Found 3 items
-rw-r--r--   2 me me          0 2020-02-02 05:40 myclass_parquet/_SUCCESS
-rw-r--r--   2 me me       1210 2020-02-02 05:40 myclass_parquet/part-00000-f8303ddc-21af-4d58-9697-ae2f470e4791-c000.snappy.parquet
-rw-r--r--   2 me me       1210 2020-02-02 05:40 myclass_parquet/part-00001-f8303ddc-21af-4d58-9697-ae2f470e4791-c000.snappy.parquet
[me@myhost ~]$ hadoop fs -copyToLocal myclass_parquet/part-00000-f8303ddc-21af-4d58-9697-ae2f470e4791-c000.snappy.parquet
[me@myhost ~]$ parquet-tools schema part-00000-f8303ddc-21af-4d58-9697-ae2f470e4791-c000.snappy.parquet
message spark_schema {
  optional binary mystring (UTF8);
  required boolean myboolean;
  required int32 myint;
  required int64 myLong;
  required float myfloat;
  required double mydouble;
}
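The Parquet schema mirrors Scala nullability: mystring is optional because String is a reference type, while the primitive fields come out required. Wrapping a field in Option is the usual way to make a column nullable; a hypothetical variant not run in this session:

// Option-typed fields become nullable columns (shell session assumed,
// so spark.implicits._ is already in scope).
case class MyClassOpt(mystring: Option[String], myint: Option[Int])
val dfOpt = Seq(MyClassOpt(Some("a"), Some(1)), MyClassOpt(None, None)).toDF
dfOpt.printSchema // both columns: nullable = true

Note that when the files are read back later in this session, printSchema reports every column as nullable = true: Spark relaxes nullability when reading files rather than trusting the file metadata.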
[me@myhost ~]$ parquet-tools cat part-00000-f8303ddc-21af-4d58-9697-ae2f470e4791-c000.snappy.parquet
mystring = a
myboolean = true
myint = 1
myLong = 1
myfloat = 0.14285715
mydouble = 0.1428571492433548
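The first row makes the float/double precision difference visible: both myfloat and mydouble were populated from 1/7f, a Float, so mydouble only carries float precision widened to double. A quick check of the arithmetic:

// Why mydouble shows float precision rather than 0.14285714285714285:
val f: Float   = 1 / 7f            // 0.14285715
val d1: Double = (1 / 7f).toDouble // 0.1428571492433548 (float precision kept)
val d2: Double = 1 / 7d            // 0.14285714285714285 (true double division)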
[me@myhost ~]$ hadoop fs -ls myclass_avro
Found 3 items
-rw-r--r--   2 me me          0 2020-02-02 06:31 myclass_avro/_SUCCESS
-rw-r--r--   2 me me        363 2020-02-02 06:31 myclass_avro/part-00000-31b5df6e-2ab4-4310-b4dc-11346dc7fc88-c000.avro
-rw-r--r--   2 me me        363 2020-02-02 06:31 myclass_avro/part-00001-31b5df6e-2ab4-4310-b4dc-11346dc7fc88-c000.avro
[me@myhost ~]$ hadoop fs -copyToLocal myclass_avro/part-00000-31b5df6e-2ab4-4310-b4dc-11346dc7fc88-c000.avro
[me@myhost ~]$ avro-tools getschema part-00000-31b5df6e-2ab4-4310-b4dc-11346dc7fc88-c000.avro
{
  "type" : "record",
  "name" : "topLevelRecord",
  "fields" : [ {
    "name" : "mystring",
    "type" : [ "string", "null" ]
  }, {
    "name" : "myboolean",
    "type" : "boolean"
  }, {
    "name" : "myint",
    "type" : "int"
  }, {
    "name" : "myLong",
    "type" : "long"
  }, {
    "name" : "myfloat",
    "type" : "float"
  }, {
    "name" : "mydouble",
    "type" : "double"
  } ]
}
[me@myhost ~]$ avro-tools tojson part-00000-31b5df6e-2ab4-4310-b4dc-11346dc7fc88-c000.avro
{"mystring":{"string":"a"},"myboolean":true,"myint":1,"myLong":1,"myfloat":0.14285715,"mydouble":0.1428571492433548}
[me@myhost ~]$ spark2-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://myhost:4040
Spark context available as 'sc' (master = yarn, app id = application_1570004862164_138201).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera4
      /_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_141)
Type in expressions to have them evaluated.
Type :help for more information.
scala> val df = spark.read.parquet("myclass_parquet")
df: org.apache.spark.sql.DataFrame = [mystring: string, myboolean: boolean ... 4 more fields]
scala> df.show
+--------+---------+-----+------+----------+------------------+
|mystring|myboolean|myint|myLong| myfloat| mydouble|
+--------+---------+-----+------+----------+------------------+
| a| true| 1| 1|0.14285715|0.1428571492433548|
| b| false| 2| 2| 2.0| 2.0|
+--------+---------+-----+------+----------+------------------+
scala> df.printSchema
root
 |-- mystring: string (nullable = true)
 |-- myboolean: boolean (nullable = true)
 |-- myint: integer (nullable = true)
 |-- myLong: long (nullable = true)
 |-- myfloat: float (nullable = true)
 |-- mydouble: double (nullable = true)
scala> val df = spark.read.format("parquet").load("myclass_parquet")
df: org.apache.spark.sql.DataFrame = [mystring: string, myboolean: boolean ... 4 more fields]
scala> df.show
+--------+---------+-----+------+----------+------------------+
|mystring|myboolean|myint|myLong| myfloat| mydouble|
+--------+---------+-----+------+----------+------------------+
| a| true| 1| 1|0.14285715|0.1428571492433548|
| b| false| 2| 2| 2.0| 2.0|
+--------+---------+-----+------+----------+------------------+
[me@myhost ~]$ spark2-shell --jars spark-avro_2.11-4.0.0.jar
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://myhost:4040
Spark context available as 'sc' (master = yarn, app id = application_1570004862164_138246).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera4
      /_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_141)
Type in expressions to have them evaluated.
Type :help for more information.
scala> val df = spark.read.avro("myclass_avro")
<console>:23: error: value avro is not a member of org.apache.spark.sql.DataFrameReader
       val df = spark.read.avro("myclass_avro")
                           ^
scala> val df = spark.read.format("avro").load("myclass_avro")
org.apache.spark.sql.AnalysisException: Failed to find data source: avro. Please find an Avro package at http://spark.apache.org/third-party-projects.html;
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:190)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:174)
... 49 elided
scala> val df = spark.read.format("com.databricks.spark.avro").load("myclass_avro")
df: org.apache.spark.sql.DataFrame = [mystring: string, myboolean: boolean ... 4 more fields]
scala> df.show
+--------+---------+-----+------+----------+------------------+
|mystring|myboolean|myint|myLong| myfloat| mydouble|
+--------+---------+-----+------+----------+------------------+
| a| true| 1| 1|0.14285715|0.1428571492433548|
| b| false| 2| 2| 2.0| 2.0|
+--------+---------+-----+------+----------+------------------+
scala> df.printSchema
root
 |-- mystring: string (nullable = true)
 |-- myboolean: boolean (nullable = true)
 |-- myint: integer (nullable = true)
 |-- myLong: long (nullable = true)
 |-- myfloat: float (nullable = true)
 |-- mydouble: double (nullable = true)
scala> import com.databricks.spark.avro._
import com.databricks.spark.avro._
scala> val df = spark.read.avro("myclass_avro")
df: org.apache.spark.sql.DataFrame = [mystring: string, myboolean: boolean ... 4 more fields]
scala> df.show
+--------+---------+-----+------+----------+------------------+
|mystring|myboolean|myint|myLong| myfloat| mydouble|
+--------+---------+-----+------+----------+------------------+
| a| true| 1| 1|0.14285715|0.1428571492433548|
| b| false| 2| 2| 2.0| 2.0|
+--------+---------+-----+------+----------+------------------+
scala> df.printSchema
root
 |-- mystring: string (nullable = true)
 |-- myboolean: boolean (nullable = true)
 |-- myint: integer (nullable = true)
 |-- myLong: long (nullable = true)
 |-- myfloat: float (nullable = true)
 |-- mydouble: double (nullable = true)
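For completeness: from Spark 2.4 onward the Avro data source is an official Apache Spark module, so the short name resolves once the org.apache.spark:spark-avro artifact is on the classpath and no com.databricks import is needed. A sketch for a 2.4+ cluster, not runnable on the 2.3 installation used above:

// Spark 2.4+: launch with
//   spark-shell --packages org.apache.spark:spark-avro_2.11:2.4.0
// and the built-in short name works directly:
val df24 = spark.read.format("avro").load("myclass_avro")
df24.write.mode("overwrite").format("avro").save("myclass_avro_24") // hypothetical path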