Author: Bartosz Wieczorek

Java 8 Lambda returning lambda part 2

        // Curried subtraction: applying the left operand yields a function that awaits the right operand.
        Function<Integer, Function<Integer, Integer>> subtract = left -> right -> left - right;
        List<Integer> numbers = Arrays.asList(1, 2, 3);
        Integer acc = 0;
        // Walk the list from its last element to its first, passing the running result in as the right operand.
        int idx = numbers.size();
        while (--idx >= 0) {
            Integer current = numbers.get(idx); // 3, then 2, then 1
            Function<Integer, Integer> currentMinus = subtract.apply(current); // r -> 3 - r, then r -> 2 - r, then r -> 1 - r
            acc = currentMinus.apply(acc); // 3 - 0 = 3, then 2 - 3 = -1, then 1 - (-1) = 2
            System.out.println("intermediate res=" + acc);
        }
        System.out.println("final res=" + acc); // 2

Output:

intermediate res=3
intermediate res=-1
intermediate res=2
final res=2

Java 8 Lambda returning lambda

import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * Shows four equivalent ways of keeping only the names that contain both "i" and "c":
 * inline lambdas, predicates held in variables, a static factory method that returns a
 * predicate, and a lambda that itself returns a predicate.
 */
public class LamdaReturningLambda {

    /**
     * Runs the four filtering variants one after another over the same list and prints
     * every name that passes both filters.
     */
    public static void main(String[] args) {
        List<String> names = Arrays.asList("Bob", "Alice");

        // 1) lambdas written inline at the call site
        names.stream()
            .filter(candidate -> candidate.contains("i"))
            .filter(candidate -> candidate.contains("c"))
            .forEach(System.out::println);

        // 2) the same lambdas, first stored in predicate variables
        Predicate<String> hasI = candidate -> candidate.contains("i");
        Predicate<String> hasC = candidate -> candidate.contains("c");
        names.stream()
            .filter(hasI)
            .filter(hasC)
            .forEach(System.out::println);

        // 3) predicates produced by a static factory method
        Predicate<String> hasIFromMethod = containsString("i");
        Predicate<String> hasCFromMethod = containsString("c");
        names.stream()
            .filter(hasIFromMethod)
            .filter(hasCFromMethod)
            .forEach(System.out::println);

        // 4) predicates produced by a lambda that returns a lambda
        Function<String, Predicate<String>> predicateFactory =
            fragment -> candidate -> candidate.contains(fragment);
        names.stream()
            .filter(predicateFactory.apply("i"))
            .filter(predicateFactory.apply("c"))
            .forEach(System.out::println);
    }

    /**
     * Builds a predicate that accepts the strings containing {@code s}.
     *
     * @param s the fragment to look for
     * @return a predicate that is true for every string containing {@code s}
     */
    static Predicate<String> containsString(String s) {
        Predicate<String> containsFragment = candidate -> candidate.contains(s);
        return containsFragment;
    }

}

Output:

Alice
Alice
Alice
Alice

Java 8 jdk8 constructor reference

/**
 * Minimal value holder used to demonstrate constructor references: it wraps a single
 * int id and renders it in {@link #toString()}.
 */
static class Person {
        // Assigned once in the constructor and never modified afterwards, hence final.
        private final int id;

        /**
         * @param id the identifier shown by {@link #toString()}
         */
        public Person(int id) {
            this.id = id;
        }

        /**
         * @return the text {@code Person{id=<id>}}
         */
        @Override
        public String toString() {
            return "Person{id=" + id + '}';
        }
    }

    /**
     * Prints one Person for each of the ids 0, 1 and 2, creating every instance through
     * a constructor reference.
     */
    public static void main(String[] args) {
        IntStream ids = IntStream.range(0, 3);
        // ids.map(n -> new Person(n)) does not compile: IntStream.map expects the lambda to return an int.
        // mapToObj is the int-to-object variant; Person::new is equivalent to n -> new Person(n).
        ids.mapToObj(Person::new)
                .forEach(System.out::println);
    }

Output:

Person{id=0}
Person{id=1}
Person{id=2}

Understanding java jdk8 Optional

        // Walk-through of java.util.Optional; the trailing comment on each line is the text that statement prints.

        // Creating Optionals and reading the wrapped value
        System.out.println( Optional.empty() ); // Optional.empty
        System.out.println( Optional.ofNullable(null) ); // Optional.empty
        System.out.println( Optional.ofNullable("a") ); // Optional[a]
        System.out.println( Optional.ofNullable("a").get() ); // a


        // filter keeps the value only when the predicate returns true
        System.out.println( Optional.ofNullable("a").filter(s -> true) ); // Optional[a] (filter match)
        System.out.println( Optional.ofNullable("a").filter(s -> false) ); // Optional.empty (filter not match)

        // map wraps the mapper result again; a null result becomes an empty Optional
        System.out.println( Optional.ofNullable("a").map(s -> null)); // Optional.empty (mapper function returning null)
        System.out.println( Optional.ofNullable("a").map(s -> "b")); // Optional[b] (re-map)
        System.out.println( Optional.ofNullable("a").map(s -> "b".equals(s) ? s : null) ); // Optional.empty (filtering using map)
        System.out.println( Optional.ofNullable("a").map(s -> Optional.of("b")) ); // Optional[Optional[b]] (mapper function returning optional)
        System.out.println( Optional.empty().map(s -> { System.out.print("map"); return "b"; }) ); // Optional.empty (mapper not run)

        // flatMap uses the Optional returned by the mapper as-is, so there is no nesting as with map above
        System.out.println( Optional.ofNullable("a").flatMap(s -> Optional.of("b")) ); // Optional[b] (mapper function returning optional)

        // orElse supplies a fallback that is used only when the Optional is empty
        System.out.println( Optional.of("a").orElse("c") ); // a
        System.out.println( Optional.empty().orElse("c") ); // c


        // NOTE(review): printAndReturn is not defined in this snippet; judging by the output comments it
        // presumably prints its first argument and returns its second - confirm against its definition.

        // Empty Optional: the mapper is skipped and the orElse fallback is returned
        System.out.println( Optional.ofNullable(null)
                .map(s -> printAndReturn("map: ", "b"))
                .orElse(printAndReturn("orElse: ", "c")) ); // orElse: c (executed only orElse branch, returned value c)

        // Non-empty Optional: the argument of orElse is an ordinary method argument, so it is evaluated
        // even though its value ends up unused
        System.out.println( Optional.of("a")
                                    .map(s -> printAndReturn("map: ", "b"))
                                    .orElse(printAndReturn("orElse: ", "c")) ); // map: orElse: b (executed both branches, returned value b)

        // orElseGet takes a supplier that is invoked only when the Optional is empty, so the fallback is not computed here
        System.out.println( Optional.of("a")
                .map(s -> printAndReturn("map: ", "b"))
                .orElseGet(() -> printAndReturn("orElse: ", "c")) ); // map: b (executed only map branch, returned value b)

Hibernate n+1 selects and lazy loading vs eager

Assuming a shop can have many products we want to display shops along with products:
1) with a single select query using a join – we need to set fetch = FetchType.EAGER on the @OneToMany association, which produces:

Hibernate:
select
this_.id as id1_1_1_,
this_.shipment_address as shipment2_1_1_,
products2_.shop_id as shop_id4_0_3_,
products2_.id as id1_0_3_,
products2_.id as id1_0_0_,
products2_.name as name2_0_0_,
products2_.price as price3_0_0_,
products2_.shop_id as shop_id4_0_0_
from
Shop this_
left outer join
Product products2_
on this_.id=products2_.shop_id

when calling

            Session session = sessionFactory.openSession();
            Criteria criteria = session.createCriteria(Shop.class).setResultTransformer(Criteria.DISTINCT_ROOT_ENTITY);
            // Criteria.list() returns a raw List. The criteria was created for Shop.class, so the
            // elements are Shop instances and the unchecked conversion below is safe.
            // The result is named "shops" to match the follow-up snippet that calls shops.get(0).
            @SuppressWarnings("unchecked")
            List<Shop> shops = criteria.list();

2) when we set fetch type to LAZY (default) then we get one select for shops and optionally additional selects for products for particular shop(s):

Hibernate:
select
this_.id as id1_1_0_,
this_.shipment_address as shipment2_1_0_
from
Shop this_

when retrieving shops
and when we want to retrieve products for particular shop(s)

            // "shops" is the previously retrieved list of Shop entities (it is not declared in this snippet).
            Shop shop = shops.get(0);
            // Declared as List<Product> rather than a raw List so the elements can be used without casts.
            List<Product> products = shop.getProducts();

then we get:

Hibernate:
select
products0_.shop_id as shop_id4_0_0_,
products0_.id as id1_0_0_,
products0_.id as id1_0_1_,
products0_.name as name2_0_1_,
products0_.price as price3_0_1_,
products0_.shop_id as shop_id4_0_1_
from
Product products0_
where
products0_.shop_id=?

for code

/**
 * Shop entity with a shipment address and a one-to-many collection of {@link Product}s.
 * The association is mapped by the {@code shop} field of Product, is fetched lazily and
 * cascades all operations to the products.
 */
@Entity
public class Shop extends BaseEntity {

    @Column(name = "shipment_address", nullable = false)
    private String shipmentAddress;

    // The element type is declared explicitly: with a raw List and no targetEntity attribute
    // the persistence provider has no way to determine which entity the collection holds.
    @OneToMany(mappedBy = "shop", fetch = FetchType.LAZY, cascade = CascadeType.ALL)
    private List<Product> products = new ArrayList<>();

    Shop() {
        // default constructor needed for ORM
    }

    /**
     * Creates a shop and attaches the given products to it.
     *
     * @param shipmentAddress the shipment address of the shop
     * @param products the products to attach; each one gets this shop set as its owner
     */
    public Shop(String shipmentAddress, Product... products) {
        this.shipmentAddress = shipmentAddress;
        for (Product product : products) {
            // keep both sides of the bidirectional association in sync
            product.addShop(this);
            this.products.add(product);
        }
    }

    /**
     * @return the products of this shop (the internal list itself, not a copy)
     */
    public List<Product> getProducts() {
        return products;
    }

    /**
     * @return a text with the base entity data and the shipment address; products are not included
     */
    @Override
    public String toString() {
        return "Shop{" +
                super.toString() +
                ", shipmentAddress='" + shipmentAddress + '\'' +
                // NOTE(review): products are presumably left out so that toString() does not touch the lazy collection
                //", products=" + products +
                '}';
    }
}

/**
 * Product entity with a name, a price and a many-to-one reference to the {@link Shop}
 * it belongs to (join column {@code shop_id}).
 */
@Entity
public class Product extends BaseEntity {

    @Column
    private String name;

    @Column
    private BigDecimal price;

    @ManyToOne
    @JoinColumn(name = "shop_id", foreignKey = @ForeignKey(name = "fk_product_shop"))
    private Shop shop;

    Product() {
        // default constructor needed for ORM
    }

    /**
     * Creates a product that is not yet attached to any shop.
     *
     * @param name the product name
     * @param price the product price
     */
    public Product(String name, BigDecimal price) {
        this.name = name;
        this.price = price;
    }

    /**
     * Sets the shop this product belongs to; called by the Shop constructor when the
     * product is attached.
     *
     * @param shop the owning shop
     */
    void addShop(Shop shop) {
        this.shop = shop;
    }

    /**
     * @return a text with the base entity data, name, price and the id of the owning shop;
     *         the shop id is rendered as {@code null} while the product has no shop
     */
    @Override
    public String toString() {
        // A product built with the public constructor has no shop until it is attached to one,
        // so the shop must not be dereferenced unconditionally.
        String shopId = (shop == null) ? "null" : String.valueOf(shop.getId());
        return "Product{" +
                super.toString() +
                ", name='" + name + '\'' +
                ", price=" + price +
                ", shop.id=" + shopId + '}';
    }
}

Hive external table creation, data partitioning and avro schema evolution

Read first: https://bmwieczorek.wordpress.com/2017/05/29/unions-and-default-value-in-apache-avro-serialization-and-deserialization/

Create an Avro schema user.avsc for User with a single string property name, serialize a user to users.avro, and upload both the schema and the data file to HDFS:

[cloudera@quickstart ~]$ hdfs dfs -copyFromLocal /media/sf_vbox-shared/user.avsc /schema/
[cloudera@quickstart ~]$ hdfs dfs -mkdir -p /data/users/year=2017/month=05/day=24/hour=09
[cloudera@quickstart ~]$ hdfs dfs -copyFromLocal /media/sf_vbox-shared/users/year\=2017/month\=05/day\=24/hour\=09/users.avro /data/users/year=2017/month=05/day=24/hour=09/

Connect to Hive via beeline, create an external table on top of /data/users, run MSCK REPAIR TABLE so that Hive registers the partition directories containing users.avro (the table returns no rows before that), and display the table content:

[cloudera@quickstart ~]$ beeline
Beeline version 1.1.0-cdh5.10.0 by Apache Hive
beeline> !connect jdbc:hive2://localhost:10000
scan complete in 1ms
Connecting to jdbc:hive2://localhost:10000
Enter username for jdbc:hive2://localhost:10000: cloudera
Enter password for jdbc:hive2://localhost:10000: ********
Connected to: Apache Hive (version 1.1.0-cdh5.10.0)
Driver: Hive JDBC (version 1.1.0-cdh5.10.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://localhost:10000> CREATE EXTERNAL TABLE Users PARTITIONED BY (year String, month String, day String, hour String) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '/data/users' TBLPROPERTIES ('avro.schema.url'='hdfs:///schema/user.avsc');
INFO  : Compiling command(queryId=hive_20170530023737_04121f8c-8e27-4b6f-8816-6fa8ccf7d993): CREATE EXTERNAL TABLE Users PARTITIONED BY (year String, month String, day String, hour String) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '/data/users' TBLPROPERTIES ('avro.schema.url'='hdfs:///schema/user.avsc')
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO  : Completed compiling command(queryId=hive_20170530023737_04121f8c-8e27-4b6f-8816-6fa8ccf7d993); Time taken: 0.028 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530023737_04121f8c-8e27-4b6f-8816-6fa8ccf7d993): CREATE EXTERNAL TABLE Users PARTITIONED BY (year String, month String, day String, hour String) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '/data/users' TBLPROPERTIES ('avro.schema.url'='hdfs:///schema/user.avsc')
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing command(queryId=hive_20170530023737_04121f8c-8e27-4b6f-8816-6fa8ccf7d993); Time taken: 0.073 seconds
INFO  : OK
No rows affected (0.117 seconds)
0: jdbc:hive2://localhost:10000> SELECT * FROM Users;
INFO  : Compiling command(queryId=hive_20170530023838_cc15d926-28e7-477d-8de3-948a1a7c00a3): SELECT * FROM Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:users.name, type:string, comment:null), FieldSchema(name:users.year, type:string, comment:null), FieldSchema(name:users.month, type:string, comment:null), FieldSchema(name:users.day, type:string, comment:null), FieldSchema(name:users.hour, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20170530023838_cc15d926-28e7-477d-8de3-948a1a7c00a3); Time taken: 0.087 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530023838_cc15d926-28e7-477d-8de3-948a1a7c00a3): SELECT * FROM Users
INFO  : Completed executing command(queryId=hive_20170530023838_cc15d926-28e7-477d-8de3-948a1a7c00a3); Time taken: 0.0 seconds
INFO  : OK
+-------------+-------------+--------------+------------+-------------+--+
| users.name  | users.year  | users.month  | users.day  | users.hour  |
+-------------+-------------+--------------+------------+-------------+--+
+-------------+-------------+--------------+------------+-------------+--+
No rows selected (0.128 seconds)
0: jdbc:hive2://localhost:10000> MSCK REPAIR TABLE Users;
INFO  : Compiling command(queryId=hive_20170530023939_b04b73a2-e03e-4b4b-826d-e00214ffbe50): MSCK REPAIR TABLE Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO  : Completed compiling command(queryId=hive_20170530023939_b04b73a2-e03e-4b4b-826d-e00214ffbe50); Time taken: 0.004 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530023939_b04b73a2-e03e-4b4b-826d-e00214ffbe50): MSCK REPAIR TABLE Users
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing command(queryId=hive_20170530023939_b04b73a2-e03e-4b4b-826d-e00214ffbe50); Time taken: 0.155 seconds
INFO  : OK
No rows affected (0.175 seconds)
0: jdbc:hive2://localhost:10000> SELECT * FROM Users;
INFO  : Compiling command(queryId=hive_20170530023939_392a3a78-0f40-45b7-b227-fa6cfdc69fce): SELECT * FROM Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:users.name, type:string, comment:null), FieldSchema(name:users.year, type:string, comment:null), FieldSchema(name:users.month, type:string, comment:null), FieldSchema(name:users.day, type:string, comment:null), FieldSchema(name:users.hour, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20170530023939_392a3a78-0f40-45b7-b227-fa6cfdc69fce); Time taken: 0.064 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530023939_392a3a78-0f40-45b7-b227-fa6cfdc69fce): SELECT * FROM Users
INFO  : Completed executing command(queryId=hive_20170530023939_392a3a78-0f40-45b7-b227-fa6cfdc69fce); Time taken: 0.0 seconds
INFO  : OK
+-------------+-------------+--------------+------------+-------------+--+
| users.name  | users.year  | users.month  | users.day  | users.hour  |
+-------------+-------------+--------------+------------+-------------+--+
| Alyssa      | 2017        | 05           | 24         | 09          |
+-------------+-------------+--------------+------------+-------------+--+

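Change the User schema to include an additional property (described here as favorite_color, a union of null and string with default null; note that the final query output below reports the new column as favorite_number of type int), generate a new users.avro file and upload it to HDFS into a different partition (a different hour):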

[cloudera@quickstart dev]$ hdfs dfs -mkdir -p /data/users/year=2017/month=05/day=24/hour=10
[cloudera@quickstart dev]$ hdfs dfs -copyFromLocal /media/sf_vbox-shared/users/year\=2017/month\=05/day\=24/hour\=10/users.avro /data/users/year=2017/month=05/day=24/hour=10/

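Repair the Hive table and select again: all rows are now returned, but still without the new column favorite_color, because the schema stored at /schema/user.avsc in HDFS has not been replaced yet: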

0: jdbc:hive2://localhost:10000> SELECT * FROM Users;
INFO  : Compiling command(queryId=hive_20170530024040_5f1a6880-61b2-4c87-bb35-0c643cd511ba): SELECT * FROM Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:users.name, type:string, comment:null), FieldSchema(name:users.year, type:string, comment:null), FieldSchema(name:users.month, type:string, comment:null), FieldSchema(name:users.day, type:string, comment:null), FieldSchema(name:users.hour, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20170530024040_5f1a6880-61b2-4c87-bb35-0c643cd511ba); Time taken: 0.068 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530024040_5f1a6880-61b2-4c87-bb35-0c643cd511ba): SELECT * FROM Users
INFO  : Completed executing command(queryId=hive_20170530024040_5f1a6880-61b2-4c87-bb35-0c643cd511ba); Time taken: 0.0 seconds
INFO  : OK
+-------------+-------------+--------------+------------+-------------+--+
| users.name  | users.year  | users.month  | users.day  | users.hour  |
+-------------+-------------+--------------+------------+-------------+--+
| Alyssa      | 2017        | 05           | 24         | 09          |
+-------------+-------------+--------------+------------+-------------+--+
1 row selected (0.105 seconds)
0: jdbc:hive2://localhost:10000> MSCK REPAIR TABLE Users;
INFO  : Compiling command(queryId=hive_20170530024040_2db31c0d-7f15-4f72-934a-a7b0dfacc045): MSCK REPAIR TABLE Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO  : Completed compiling command(queryId=hive_20170530024040_2db31c0d-7f15-4f72-934a-a7b0dfacc045); Time taken: 0.002 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530024040_2db31c0d-7f15-4f72-934a-a7b0dfacc045): MSCK REPAIR TABLE Users
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing command(queryId=hive_20170530024040_2db31c0d-7f15-4f72-934a-a7b0dfacc045); Time taken: 0.114 seconds
INFO  : OK
No rows affected (0.128 seconds)
0: jdbc:hive2://localhost:10000> SELECT * FROM Users;
INFO  : Compiling command(queryId=hive_20170530024040_4f0cbff8-4c54-4e72-ada0-c7ea2100be71): SELECT * FROM Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:users.name, type:string, comment:null), FieldSchema(name:users.year, type:string, comment:null), FieldSchema(name:users.month, type:string, comment:null), FieldSchema(name:users.day, type:string, comment:null), FieldSchema(name:users.hour, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20170530024040_4f0cbff8-4c54-4e72-ada0-c7ea2100be71); Time taken: 0.07 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530024040_4f0cbff8-4c54-4e72-ada0-c7ea2100be71): SELECT * FROM Users
INFO  : Completed executing command(queryId=hive_20170530024040_4f0cbff8-4c54-4e72-ada0-c7ea2100be71); Time taken: 0.0 seconds
INFO  : OK
+-------------+-------------+--------------+------------+-------------+--+
| users.name  | users.year  | users.month  | users.day  | users.hour  |
+-------------+-------------+--------------+------------+-------------+--+
| Alyssa      | 2017        | 05           | 24         | 09          |
| Alyssa      | 2017        | 05           | 24         | 10          |
+-------------+-------------+--------------+------------+-------------+--+
2 rows selected (0.119 seconds)
0: jdbc:hive2://localhost:10000> --delete old schema and upload new schema with favorite color to hdfs

Delete old schema from hdfs and replace it with new one containing new column

[cloudera@quickstart dev]$ hdfs dfs -rm -skipTrash /schema/user.avsc
Deleted /schema/user.avsc
[cloudera@quickstart dev]$ hdfs dfs -copyFromLocal /media/sf_vbox-shared/schema-with-favorite-color/user.avsc /schema/

Repair the Hive table once more and display all rows, now including the new column:

0: jdbc:hive2://localhost:10000> MSCK REPAIR TABLE Users;
INFO  : Compiling command(queryId=hive_20170530024444_0bb2c55f-7e12-4cde-8154-e77ce491e035): MSCK REPAIR TABLE Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:null, properties:null)
INFO  : Completed compiling command(queryId=hive_20170530024444_0bb2c55f-7e12-4cde-8154-e77ce491e035); Time taken: 0.001 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530024444_0bb2c55f-7e12-4cde-8154-e77ce491e035): MSCK REPAIR TABLE Users
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing command(queryId=hive_20170530024444_0bb2c55f-7e12-4cde-8154-e77ce491e035); Time taken: 0.047 seconds
INFO  : OK
No rows affected (0.061 seconds)
0: jdbc:hive2://localhost:10000> SELECT * FROM Users;
INFO  : Compiling command(queryId=hive_20170530024545_e9142a48-92c5-41c9-a6c6-f2a611fa4e4f): SELECT * FROM Users
INFO  : Semantic Analysis Completed
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:users.name, type:string, comment:null), FieldSchema(name:users.favorite_number, type:int, comment:null), FieldSchema(name:users.year, type:string, comment:null), FieldSchema(name:users.month, type:string, comment:null), FieldSchema(name:users.day, type:string, comment:null), FieldSchema(name:users.hour, type:string, comment:null)], properties:null)
INFO  : Completed compiling command(queryId=hive_20170530024545_e9142a48-92c5-41c9-a6c6-f2a611fa4e4f); Time taken: 0.059 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20170530024545_e9142a48-92c5-41c9-a6c6-f2a611fa4e4f): SELECT * FROM Users
INFO  : Completed executing command(queryId=hive_20170530024545_e9142a48-92c5-41c9-a6c6-f2a611fa4e4f); Time taken: 0.0 seconds
INFO  : OK
+-------------+------------------------+-------------+--------------+------------+-------------+--+
| users.name  | users.favorite_number  | users.year  | users.month  | users.day  | users.hour  |
+-------------+------------------------+-------------+--------------+------------+-------------+--+
| Alyssa      | NULL                   | 2017        | 05           | 24         | 09          |
| Alyssa      | 256                    | 2017        | 05           | 24         | 10          |
+-------------+------------------------+-------------+--------------+------------+-------------+--+
2 rows selected (0.118 seconds)
0: jdbc:hive2://localhost:10000>