#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import json
import os
import random
import shutil
import string
from pathlib import Path
from typing import Any, List, Mapping, Optional, Tuple

import pendulum
import pytest
from smart_open import open as smart_open
from source_s3.source_files_abstract.file_info import FileInfo
from source_s3.source_files_abstract.formats.csv_parser import CsvParser

from .abstract_test_parser import AbstractTestParser, memory_limit
from .conftest import TMP_FOLDER

SAMPLE_DIRECTORY = Path(__file__).resolve().parent.joinpath("sample_files/")

# All possible CSV data types
CSV_TYPES = {
    # logical_type: (json_type, csv_types, convert_function)
    # standard types
    "string": ("string", ["string"], None),
    "boolean": ("boolean", ["boolean"], None),
    "number": ("number", ["number"], None),
    "integer": ("integer", ["integer"], None),
}
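
# Note: only the mapping keys and the json_type element are exercised directly in this
# module; the csv_types and convert_function slots appear to follow the shared
# record_types convention used by the other format parsers' test modules.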


def _generate_value(typ: str) -> Any:
    if typ == "string":
        # strings are nullable: roughly half the time, emit None instead of a value
        if AbstractTestParser._generate_value("boolean"):
            return None
        random_length = random.randint(0, 512)
        return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random_length))
    return AbstractTestParser._generate_value(typ)


def _generate_row(types: List[str]) -> List[Any]:
    """Generates a row of random values for the requested types."""
    row = []
    for needed_type in types:
        for json_type in CSV_TYPES:
            if json_type == needed_type:
                value = _generate_value(needed_type)
                # serialize None as an empty CSV field
                if value is None:
                    value = ""
                row.append(str(value))
                break
    return row
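
# Hypothetical output: _generate_row(["integer", "string"]) might return ["42", "XK3J9QPL"];
# every value is stringified, with None rendered as an empty field.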


def generate_csv_file(filename: str, columns: Mapping[str, str], num_rows: int, delimiter: str) -> str:
    """Generates random CSV data and saves it to a tmp file."""
    header_line = delimiter.join(columns.keys())
    types = list(columns.values()) if num_rows else []
    with open(filename, "w") as f:
        f.write(header_line + "\n")
        for _ in range(num_rows):
            f.write(delimiter.join(_generate_row(types)) + "\n")
    return filename
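
# Hypothetical usage: generate_csv_file("/tmp/sample.csv", {"id": "integer", "name": "string"}, 10, ",")
# writes a header line "id,name" followed by 10 rows of random values and returns the path.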


def generate_big_file(
    filepath: str, size_in_gigabytes: float, columns_number: int, template_file: Optional[str] = None
) -> Tuple[Optional[dict], float]:
    temp_files = [filepath + ".1", filepath + ".2"]
    if template_file:
        shutil.copyfile(template_file, filepath)
        schema = None
    else:
        schema = {f"column {i}": random.choice(["integer", "string", "boolean", "number"]) for i in range(columns_number)}
        generate_csv_file(filepath, schema, 456, ",")

    # copy the data rows (everything except the header) into the first temp file
    skip_headers = False
    with open(filepath, "r") as f:
        with open(temp_files[0], "w") as tf:
            for line in f:
                if not skip_headers:
                    skip_headers = True
                    continue
                tf.write(line)

    # repeatedly append the copied data rows back onto the main file until it exceeds the target size
    with open(filepath, "ab") as f:
        while True:
            file_size = os.stat(filepath).st_size / (1024**3)
            if file_size > size_in_gigabytes:
                break
            with open(temp_files[0], "rb") as tf:  # type: ignore[assignment]
                with open(temp_files[1], "wb") as tf2:
                    buf = tf.read(50 * 1024**2)  # read up to 50 MiB per pass
                    if buf:
                        f.write(buf)  # type: ignore[arg-type]
                        tf2.write(buf)  # type: ignore[arg-type]
            temp_files.append(temp_files.pop(0))

    # remove temp files
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)
    return schema, file_size
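
# Hypothetical usage: generate_big_file("/tmp/big.csv", 0.5, 10) grows /tmp/big.csv past
# 0.5 GiB by re-appending its own data rows (up to 50 MiB per pass, ping-ponging between
# the two temp files) and returns the generated schema plus the final size in GiB.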


class TestCsvParser(AbstractTestParser):
    record_types = CSV_TYPES
    filetype = "csv"

    @classmethod
    def cases(cls) -> Mapping[str, Any]:
        return {
            "basic_normal_test": {
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "custom_csv_parameters": {
                # tests custom CSV parameters (odd delimiter, quote_char, escape_char & newlines in values in the file)
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "delimiter": "^", "quote_char": "|", "escape_char": "!", "newlines_in_values": True},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_2_params.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "encoding_Big5": {
                # tests encoding: Big5
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "encoding": "big5"}, master_schema={"id": "integer", "name": "string", "valid": "boolean"}
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_3_enc_Big5.csv"),
                "num_records": 8,
                "inferred_schema": {"id": "integer", "name": "string", "valid": "boolean"},
                "line_checks": {
                    3: {
                        "id": 3,
                        "name": "變形金剛,偽裝的機器人",
                        "valid": False,
                    }
                },
                "fails": [],
            },
            "encoding_Arabic_(Windows 1256)": {
                # tests encoding: Arabic (Windows 1256)
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "encoding": "windows-1256"},
                    master_schema={"id": "integer", "notes": "string", "valid": "boolean"},
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_4_enc_Arabic.csv"),
                "num_records": 2,
                "inferred_schema": {"id": "integer", "notes": "string", "valid": "boolean"},
                "line_checks": {
                    1: {
                        "id": 1,
                        "notes": "البايت الجوي هو الأفضل",
                        "valid": False,
                    }
                },
                "fails": [],
            },
            "compression_gzip": {
                # tests compression: gzip
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_5.csv.gz"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {
                    7: {
                        "id": 7,
                        "name": "xZhh1Kyl",
                        "valid": False,
                        "code": 10,
                        "degrees": -9.2,
                        "birthday": "2021-07-14",
                        "last_seen": "2021-07-14 15:30:09.225145",
                    }
                },
                "fails": [],
            },
            "compression_bz2": {
                # tests compression: bz2
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_7_bz2.csv.bz2"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {
                    7: {
                        "id": 7,
                        "name": "xZhh1Kyl",
                        "valid": False,
                        "code": 10,
                        "degrees": -9.2,
                        "birthday": "2021-07-14",
                        "last_seen": "2021-07-14 15:30:09.225145",
                    }
                },
                "fails": [],
            },
            "extra_columns_in_master_schema": {
                # tests extra columns in master schema
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "EXTRA_COLUMN_1": "boolean",
                        "EXTRA_COLUMN_2": "number",
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "missing_columns_in_master_schema": {
                # tests missing columns in master schema
                # TODO: maybe this should fail read_records, but it picks up all the columns from the file despite them missing from the master schema
                "AbstractFileParser": CsvParser(format={"filetype": "csv"}, master_schema={"id": "integer", "name": "string"}),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "empty_csv_file": {
                # tests empty file, SHOULD FAIL INFER & STREAM RECORDS
                "AbstractFileParser": CsvParser(format={"filetype": "csv"}, master_schema={}),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_6_empty.csv"),
                "num_records": 0,
                "inferred_schema": {},
                "line_checks": {},
                "fails": ["test_get_inferred_schema", "test_stream_records"],
            },
            "empty_advanced_options": {
                # tests that an empty advanced_options string is accepted
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "advanced_options": ""},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "no_header_csv_file": {
                # tests a file with no header; column names are supplied via advanced_options
                "AbstractFileParser": CsvParser(
                    format={
                        "filetype": "csv",
                        "advanced_options": json.dumps(
                            {"column_names": ["id", "name", "valid", "code", "degrees", "birthday", "last_seen"]}
                        ),
                    },
                    master_schema={},
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_8_no_header.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
        }
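
    # The cases above are consumed by the shared test suite in AbstractTestParser
    # (e.g. test_get_inferred_schema and test_stream_records, as referenced by the
    # "fails" lists); see abstract_test_parser.py for how these fixtures are driven.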

    @memory_limit(20)
    @pytest.mark.order(1)
    def test_big_file(self) -> None:
        """Tests streaming a big generated CSV file (~0.1 GiB here) under a constrained memory limit."""
        filepath = os.path.join(TMP_FOLDER, "big_csv_file." + self.filetype)
        schema, file_size = generate_big_file(filepath, 0.1, 123)
        expected_count = sum(1 for _ in open(filepath)) - 1
        self.logger.info(f"generated file {filepath} with size {file_size}GB, lines: {expected_count}")
        for _ in range(3):
            parser = CsvParser(
                format={"filetype": self.filetype, "block_size": 5 * 1024**2},
                master_schema=schema,
            )
            expected_file = open(filepath, "r")
            # skip the first header line
            next(expected_file)
            read_count = 0
            with smart_open(filepath, self._get_readmode({"AbstractFileParser": parser})) as f:
                for record in parser.stream_records(f, FileInfo(key=filepath, size=file_size, last_modified=pendulum.now())):
                    # compare each streamed record against the corresponding raw line of the file
                    record_line = ",".join("" if v is None else str(v) for v in record.values())
                    expected_line = next(expected_file).strip("\n")
                    assert record_line == expected_line
                    read_count += 1
                assert read_count == expected_count
            expected_file.close()