#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import json
import os
import random
import shutil
import string
from pathlib import Path
from typing import Any, List, Mapping, Tuple

import pendulum
import pytest
from smart_open import open as smart_open
from source_s3.source_files_abstract.file_info import FileInfo
from source_s3.source_files_abstract.formats.csv_parser import CsvParser

from .abstract_test_parser import AbstractTestParser, memory_limit
from .conftest import TMP_FOLDER

SAMPLE_DIRECTORY = Path(__file__).resolve().parent.joinpath("sample_files/")

# All possible CSV data types
CSV_TYPES = {
    # logical_type: (json_type, csv_types, convert_function)
    # standard types
    "string": ("string", ["string"], None),
    "boolean": ("boolean", ["boolean"], None),
    "number": ("number", ["number"], None),
    "integer": ("integer", ["integer"], None),
}
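# e.g. CSV_TYPES["integer"] == ("integer", ["integer"], None): the logical type
# maps directly onto the JSON schema type and needs no conversion function.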


def _generate_value(typ: str) -> Any:
    if typ == "string":
        # randomly emit None so that empty CSV fields are exercised as well
        if AbstractTestParser._generate_value("boolean"):
            return None
        random_length = random.randint(0, 512)
        return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random_length))
    return AbstractTestParser._generate_value(typ)
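

# Illustrative sketch (not part of the original suite; the leading underscore
# keeps pytest from collecting it): demonstrates the contract of
# _generate_value for the "string" logical type.
def _example_generate_value() -> None:
    value = _generate_value("string")
    # string values are occasionally generated as None to cover empty CSV fields
    assert value is None or isinstance(value, str)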


def _generate_row(types: List[str]) -> List[Any]:
    """Generates random values of the requested types."""
    row = []
    for needed_type in types:
        for json_type in CSV_TYPES:
            if json_type == needed_type:
                value = _generate_value(needed_type)
                if value is None:
                    value = ""
                row.append(str(value))
                break
    return row
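

# Illustrative sketch (not part of the original suite, not collected by pytest):
# _generate_row renders one stringified value per requested type, with None
# mapped to an empty string.
def _example_generate_row() -> None:
    row = _generate_row(["integer", "string", "boolean"])
    assert len(row) == 3
    assert all(isinstance(value, str) for value in row)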


def generate_csv_file(filename: str, columns: Mapping[str, str], num_rows: int, delimiter: str) -> str:
    """Generates random CSV data and saves it to a tmp file."""
    header_line = delimiter.join(columns.keys())
    types = list(columns.values()) if num_rows else []
    with open(filename, "w") as f:
        f.write(header_line + "\n")
        for _ in range(num_rows):
            f.write(delimiter.join(_generate_row(types)) + "\n")
    return filename
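

# Illustrative sketch (not part of the original suite, not collected by pytest):
# generate_csv_file writes a header line followed by num_rows random data rows.
# The file name and column names below are arbitrary.
def _example_generate_csv_file() -> None:
    path = generate_csv_file(os.path.join(TMP_FOLDER, "example.csv"), {"id": "integer", "name": "string"}, 2, ",")
    with open(path) as f:
        assert f.readline().strip() == "id,name"  # header row is the joined column names
        assert len(f.readlines()) == 2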


def generate_big_file(filepath: str, size_in_gigabytes: float, columns_number: int, template_file: str = None) -> Tuple[dict, float]:
    """Grows a CSV file past `size_in_gigabytes` by repeatedly appending the body
    of a small seed file (either a copied template or freshly generated data)."""
    temp_files = [filepath + ".1", filepath + ".2"]
    if template_file:
        shutil.copyfile(template_file, filepath)
        schema = None
    else:
        schema = {f"column {i}": random.choice(["integer", "string", "boolean", "number"]) for i in range(columns_number)}
        generate_csv_file(filepath, schema, 456, ",")

    # copy the seed file's body (without the header line) into the first temp file
    skip_headers = False
    with open(filepath, "r") as f:
        with open(temp_files[0], "w") as tf:
            for line in f:
                if not skip_headers:
                    skip_headers = True
                    continue
                tf.write(str(line))

    with open(filepath, "ab") as f:
        while True:
            file_size = os.stat(filepath).st_size / (1024**3)
            if file_size > size_in_gigabytes:
                break
            with open(temp_files[0], "rb") as tf:  # type: ignore[assignment]
                with open(temp_files[1], "wb") as tf2:
                    while True:
                        buf = tf.read(50 * 1024**2)  # read in 50Mb chunks
                        if not buf:
                            break
                        f.write(buf)  # type: ignore[arg-type]
                        tf2.write(buf)  # type: ignore[arg-type]
            temp_files.append(temp_files.pop(0))

    # remove temp files
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)
    return schema, file_size
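

# Illustrative sketch (not part of the original suite, not collected by pytest):
# generate_big_file keeps appending the seed body until the target size in
# gigabytes is exceeded, so the returned size always lands above the threshold.
# The path and sizes below are arbitrary and kept tiny.
def _example_generate_big_file() -> None:
    path = os.path.join(TMP_FOLDER, "example_big.csv")
    schema, size = generate_big_file(path, 0.001, columns_number=5)
    assert len(schema) == 5  # one random type per generated column
    assert size > 0.001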


class TestCsvParser(AbstractTestParser):
    record_types = CSV_TYPES
    filetype = "csv"

    @classmethod
    def cases(cls) -> Mapping[str, Any]:
        return {
            "basic_normal_test": {
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "custom_csv_parameters": {
                # tests custom CSV parameters (odd delimiter, quote_char, escape_char & newlines in values in the file)
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "delimiter": "^", "quote_char": "|", "escape_char": "!", "newlines_in_values": True},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_2_params.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "encoding_Big5": {
                # tests encoding: Big5
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "encoding": "big5"}, master_schema={"id": "integer", "name": "string", "valid": "boolean"}
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_3_enc_Big5.csv"),
                "num_records": 8,
                "inferred_schema": {"id": "integer", "name": "string", "valid": "boolean"},
                "line_checks": {
                    3: {
                        "id": 3,
                        "name": "變形金剛,偽裝的機器人",
                        "valid": False,
                    }
                },
                "fails": [],
            },
            "encoding_Arabic_(Windows 1256)": {
                # tests encoding: Arabic (Windows 1256)
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "encoding": "windows-1256"},
                    master_schema={"id": "integer", "notes": "string", "valid": "boolean"},
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_4_enc_Arabic.csv"),
                "num_records": 2,
                "inferred_schema": {"id": "integer", "notes": "string", "valid": "boolean"},
                "line_checks": {
                    1: {
                        "id": 1,
                        "notes": "البايت الجوي هو الأفضل",
                        "valid": False,
                    }
                },
                "fails": [],
            },
            "compression_gzip": {
                # tests compression: gzip
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_5.csv.gz"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {
                    7: {
                        "id": 7,
                        "name": "xZhh1Kyl",
                        "valid": False,
                        "code": 10,
                        "degrees": -9.2,
                        "birthday": "2021-07-14",
                        "last_seen": "2021-07-14 15:30:09.225145",
                    }
                },
                "fails": [],
            },
            "compression_bz2": {
                # tests compression: bz2
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_7_bz2.csv.bz2"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {
                    7: {
                        "id": 7,
                        "name": "xZhh1Kyl",
                        "valid": False,
                        "code": 10,
                        "degrees": -9.2,
                        "birthday": "2021-07-14",
                        "last_seen": "2021-07-14 15:30:09.225145",
                    }
                },
                "fails": [],
            },
            "extra_columns_in_master_schema": {
                # tests extra columns in master schema
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv"},
                    master_schema={
                        "EXTRA_COLUMN_1": "boolean",
                        "EXTRA_COLUMN_2": "number",
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "missing_columns_in_master_schema": {
                # tests missing columns in master schema
                # TODO: maybe this should fail read_records, but it does pick up all the columns from file despite missing from master schema
                "AbstractFileParser": CsvParser(format={"filetype": "csv"}, master_schema={"id": "integer", "name": "string"}),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "empty_csv_file": {
                # tests empty file, SHOULD FAIL INFER & STREAM RECORDS
                "AbstractFileParser": CsvParser(format={"filetype": "csv"}, master_schema={}),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_6_empty.csv"),
                "num_records": 0,
                "inferred_schema": {},
                "line_checks": {},
                "fails": ["test_get_inferred_schema", "test_stream_records"],
            },
            "empty_advanced_options": {
                "AbstractFileParser": CsvParser(
                    format={"filetype": "csv", "advanced_options": ""},
                    master_schema={
                        "id": "integer",
                        "name": "string",
                        "valid": "boolean",
                        "code": "integer",
                        "degrees": "number",
                        "birthday": "string",
                        "last_seen": "string",
                    },
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_1.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
            "no_header_csv_file": {
                # no header test
                "AbstractFileParser": CsvParser(
                    format={
                        "filetype": "csv",
                        "advanced_options": json.dumps(
                            {"column_names": ["id", "name", "valid", "code", "degrees", "birthday", "last_seen"]}
                        ),
                    },
                    master_schema={},
                ),
                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_8_no_header.csv"),
                "num_records": 8,
                "inferred_schema": {
                    "id": "integer",
                    "name": "string",
                    "valid": "boolean",
                    "code": "integer",
                    "degrees": "number",
                    "birthday": "string",
                    "last_seen": "string",
                },
                "line_checks": {},
                "fails": [],
            },
        }
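
    # Note: AbstractTestParser is expected to iterate over cases() and run its
    # shared schema-inference and record-streaming checks against each fixture;
    # a case's "fails" list names the checks that fixture is expected to fail.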

    @memory_limit(20)
    @pytest.mark.order(1)
    def test_big_file(self) -> None:
        """Tests streaming a big (~0.1 Gb) generated csv file."""
        filepath = os.path.join(TMP_FOLDER, "big_csv_file." + self.filetype)
        schema, file_size = generate_big_file(filepath, 0.1, 123)
        expected_count = sum(1 for _ in open(filepath)) - 1
        self.logger.info(f"generated file {filepath} with size {file_size}Gb, lines: {expected_count}")
        for _ in range(3):
            parser = CsvParser(
                format={"filetype": self.filetype, "block_size": 5 * 1024**2},
                master_schema=schema,
            )
            expected_file = open(filepath, "r")
            # skip the first header line
            next(expected_file)
            read_count = 0
            with smart_open(filepath, self._get_readmode({"AbstractFileParser": parser})) as f:
                for record in parser.stream_records(f, FileInfo(key=filepath, size=file_size, last_modified=pendulum.now())):
                    record_line = ",".join("" if v is None else str(v) for v in record.values())
                    expected_line = next(expected_file).strip("\n")
                    assert record_line == expected_line
                    read_count += 1
                assert read_count == expected_count
            expected_file.close()