# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Tests for IMPALA-1658

import os
import pytest

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.file_utils import create_table_from_parquet
from tests.util.filesystem_utils import get_fs_path


class TestHiveParquetTimestampConversion(CustomClusterTestSuite):
  '''Hive writes timestamps in parquet files by first converting values from local time
     to UTC. The conversion was not expected (other file formats don't convert) and a
     startup flag was later added to adjust for this (IMPALA-1658). This file tests that
     the conversion and flag behave as expected.
  '''

  @classmethod
  def add_test_dimensions(cls):
    super(CustomClusterTestSuite, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'parquet'
        and v.get_value('table_format').compression_codec == 'none')

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  def check_sanity(self, expect_converted_result):
    data = self.execute_query_expect_success(self.client, """
        SELECT COUNT(timestamp_col), COUNT(DISTINCT timestamp_col),
               MIN(timestamp_col), MAX(timestamp_col)
        FROM functional_parquet.alltypesagg_hive_13_1""",
        query_options={"timezone": "PST8PDT"})\
        .get_data()
    assert len(data) > 0
    rows = data.split("\n")
    assert len(rows) == 1
    values = rows[0].split("\t")
    assert len(values) == 4
    assert values[0] == "11000"
    assert values[1] == "10000"
    if expect_converted_result:
      # Doing an easy time zone conversion in Python seems to require a third-party
      # lib, so the only check is that the values changed in some way.
      assert values[2] != "2010-01-01 00:00:00"
      assert values[3] != "2010-01-10 18:02:05.100000000"
    else:
      assert values[2] == "2010-01-01 00:00:00"
      assert values[3] == "2010-01-10 18:02:05.100000000"

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args("-convert_legacy_hive_parquet_utc_timestamps=true "
      "-hdfs_zone_info_zip=%s" % get_fs_path("/test-warehouse/tzdb/2017c.zip"))
  def test_conversion(self, vector, unique_database):
    self.check_sanity(True)
    self._test_conversion_with_validation(vector, unique_database)
    # Override the query option convert_legacy_hive_parquet_utc_timestamps to disable
    # the conversion that the startup flag enabled.
    query_options = {"timezone": "PST8PDT",
                     "convert_legacy_hive_parquet_utc_timestamps": "0"}
    self._test_no_conversion(vector, query_options, "PST8PDT")
    # Test with UTC too to check the optimizations added in IMPALA-9385.
    for tz_name in ["PST8PDT", "UTC"]:
      # The value read from the Hive table should be the same as reading a UTC-converted
      # value from the Impala table.
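      # The query below selects the rows on which the two tables disagree (NULL-ness
      # differs, or the converted values differ), so an empty result means every row
      # matched after conversion.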
      data = self.execute_query_expect_success(self.client, """
          SELECT h.id, h.day, h.timestamp_col, i.timestamp_col
          FROM functional_parquet.alltypesagg_hive_13_1 h
          JOIN functional_parquet.alltypesagg i
            ON i.id = h.id AND i.day = h.day  -- serves as a unique key
          WHERE (h.timestamp_col IS NULL AND i.timestamp_col IS NOT NULL)
             OR (h.timestamp_col IS NOT NULL AND i.timestamp_col IS NULL)
             OR h.timestamp_col != FROM_UTC_TIMESTAMP(i.timestamp_col, '%s')
          """ % tz_name, query_options={"timezone": tz_name})\
          .get_data()
      assert len(data) == 0
    self._test_stat_filtering(vector, unique_database)

  def _test_conversion_with_validation(self, vector, unique_database):
    """Test that timestamp validation also works as expected when converting timestamps.
       Runs as part of test_conversion() to avoid restarting the cluster."""
    create_table_from_parquet(self.client, unique_database,
                              "out_of_range_timestamp_hive_211")
    create_table_from_parquet(self.client, unique_database,
                              "out_of_range_timestamp2_hive_211")
    # Allow the test cases to override abort_on_error.
    del vector.get_value('exec_option')['abort_on_error']
    self.run_test_case('QueryTest/out-of-range-timestamp-local-tz-conversion',
                       vector, unique_database)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args("-convert_legacy_hive_parquet_utc_timestamps=false "
      "-hdfs_zone_info_zip=%s" % get_fs_path("/test-warehouse/tzdb/2017c.zip"))
  def test_no_conversion(self, vector):
    self.check_sanity(False)
    # Do not override the query option convert_legacy_hive_parquet_utc_timestamps.
    query_options = {"timezone": "PST8PDT"}
    self._test_no_conversion(vector, query_options, "PST8PDT")

  def _test_no_conversion(self, vector, query_options, tz_name):
    # Without conversion all the values will be different.
    data = self.execute_query_expect_success(self.client, """
        SELECT h.id, h.day, h.timestamp_col, i.timestamp_col
        FROM functional_parquet.alltypesagg_hive_13_1 h
        JOIN functional_parquet.alltypesagg i
          ON i.id = h.id AND i.day = h.day  -- serves as a unique key
        WHERE h.timestamp_col != FROM_UTC_TIMESTAMP(i.timestamp_col, '%s')
        """ % tz_name, query_options=query_options)\
        .get_data()
    assert len(data.split('\n')) == 10000
    # A value should either stay null or stay not null.
    data = self.execute_query_expect_success(self.client, """
        SELECT h.id, h.day, h.timestamp_col, i.timestamp_col
        FROM functional_parquet.alltypesagg_hive_13_1 h
        JOIN functional_parquet.alltypesagg i
          ON i.id = h.id AND i.day = h.day  -- serves as a unique key
        WHERE (h.timestamp_col IS NULL AND i.timestamp_col IS NOT NULL)
           OR (h.timestamp_col IS NOT NULL AND i.timestamp_col IS NULL)
        """, query_options=query_options)\
        .get_data()
    assert len(data) == 0

  def _test_stat_filtering(self, vector, unique_database):
    """IMPALA-7559: Check that Parquet stat filtering doesn't skip row groups
       incorrectly when timezone conversion is needed. Runs as part of
       test_conversion() to avoid restarting the cluster."""
    self.client.execute(
        "create table %s.t (i int, d timestamp) stored as parquet" % unique_database)
    tbl_loc = get_fs_path("/test-warehouse/%s.db/t" % unique_database)
    self.filesystem_client.copy_from_local(os.environ['IMPALA_HOME'] +
        "/testdata/data/hive_single_value_timestamp.parq", tbl_loc)
    # TODO: other tests in this file could also use query option 'timezone' to enable
    # real data validation.
    data = self.execute_query_expect_success(self.client,
        'select * from %s.t' % unique_database,
        query_options={"timezone": "CET"}).get_data()
    assert data == '1\t2018-10-01 02:30:00'
    # This query returned 0 rows before the fix for IMPALA-7559.
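    # The file's min/max stats were written in UTC while the predicate literal is in
    # local time, so without converting before the comparison the single row group was
    # incorrectly pruned.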
    data = self.execute_query_expect_success(self.client,
        'select * from %s.t where d = "2018-10-01 02:30:00"' % unique_database,
        query_options={"timezone": "CET"}).get_data()
    assert data == '1\t2018-10-01 02:30:00'