Compare commits

...

4 Commits

Author SHA1 Message Date
Christophe Le Saec
6627d64228 fix(TDI-31777) csv reader correction 2020-02-24 13:50:57 +01:00
Christophe Le Saec
1b7fe995df fix(TDI-31777) - csv comments removed 2020-02-24 09:02:45 +01:00
Christophe Le Saec
5311f83c49 TDI-31777 : correction done 2019-12-19 14:46:26 +01:00
Christophe Le Saec
a60f943f6a TDI-31777 : csv reader to maven 2019-12-19 14:10:05 +01:00
11 changed files with 1000 additions and 566 deletions

View File

@@ -0,0 +1,3 @@
.classpath
.project
target/

View File

@@ -1,564 +0,0 @@
package com.talend.csv;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
/**
 * Hand-written CSV parser that pulls characters from a {@link Reader} through a small
 * fetch buffer and exposes one record at a time.
 *
 * Typical use: configure with the fluent setters, call {@link #readNext()} until it
 * returns false, and read the fields of the current record with {@link #get(int)} or
 * {@link #getValues()}.
 */
public class CSVReader {
// Underlying character source; always wrapped in a BufferedReader by the constructor.
private Reader reader;
private char separator = ',';
private char quotechar = '"';
// Same as the quote char by default, i.e. a doubled quote stands for a literal quote.
private char escapechar = '"';
// Custom record terminator; null means '\n' or '\r' ends a record.
private String lineEnd;
private boolean skipEmptyRecords = false;
private boolean trimWhitespace = true;
// Size handed to the BufferedReader.
private static final int BUFFER_SIZE = 4 * 1024;
// Number of chars requested per fill() call.
private static final int FETCH_SIZE = 10 * 50;
private char[] buffer = new char[FETCH_SIZE];
// Index of the next char to consume in buffer.
private int currentPosition = 0;
// Number of valid chars in buffer (-1 once the reader is exhausted).
private int bufferCount = 0;
private boolean hasMoreData = true;
// Set by endRecord(): a complete record is available.
private boolean hasNext = false;
private boolean inColumn = false;
private boolean escaping = false;
private char previousChar = '\0';
// Fields of the current record; only the first columnCount entries are valid.
private String[] values = new String[10];
private HeadersReader headersReader = new HeadersReader();
private int columnCount = 0;
private boolean inQuote = false;
// Accumulates the text of the field being parsed.
private StringBuilder sb = new StringBuilder(16);
private boolean storeRawRecord = false;
// Accumulates the raw text of the record when storeRawRecord is set.
private StringBuilder stringBuilder = new StringBuilder(16 * 10);
private String rawRecord = "";
/** Opens the named file and reads it with the given separator and charset. */
public CSVReader(String filename,char separator,String charset) throws IOException {
this(new FileInputStream(filename), separator, charset);
}
/** Reads from a stream decoded through UnicodeReader (declared elsewhere in the project). */
public CSVReader(InputStream inputStream,char separator,String charset) throws IOException {
this(new UnicodeReader(inputStream, charset), separator);
}
/** Reads from an arbitrary reader, which is wrapped in a BufferedReader. */
public CSVReader(Reader reader,char separator) {
this.reader = new BufferedReader(reader,BUFFER_SIZE);
this.separator = separator;
}
/**
 * Creates a comma-separated reader over an in-memory string.
 *
 * @throws IllegalArgumentException if content is null
 */
public static CSVReader parse(String content) {
if (content == null) {
throw new IllegalArgumentException(
"Parameter content can not be null.");
}
return new CSVReader(new StringReader(content),',');
}
// Fluent configuration setters: each one stores the option and returns this reader.
public CSVReader setLineEnd(String lineEnd) {
this.lineEnd = lineEnd;
return this;
}
public CSVReader setSeparator(char separator) {
this.separator = separator;
return this;
}
public CSVReader setEscapeChar(char escapechar) {
this.escapechar = escapechar;
return this;
}
public CSVReader setQuoteChar(char quotechar) {
this.quotechar = quotechar;
return this;
}
public char getQuoteChar() {
return this.quotechar;
}
public CSVReader setTrimWhitespace(boolean trimWhitespace) {
this.trimWhitespace = trimWhitespace;
return this;
}
public CSVReader setSkipEmptyRecords(boolean skipEmptyRecords) {
this.skipEmptyRecords = skipEmptyRecords;
return this;
}
public CSVReader setStoreRawRecord(boolean storeRawRecord) {
this.storeRawRecord = storeRawRecord;
return this;
}
/** Returns the raw text captured for the last record; empty unless storeRawRecord is set. */
public String getRawRecord() {
return rawRecord;
}
/** Marks the current record as complete so that the readNext() loop stops. */
public void endRecord() {
hasNext = true;
}
/**
 * Closes the field being built: appends its text to values (doubling the array when
 * full) and clears the field buffer. Trailing spaces/tabs are removed only for
 * unquoted fields when trimWhitespace is on.
 */
public void endColumn() {
inColumn = false;
String currentValue = sb.toString();
if(trimWhitespace && !inQuote) {
currentValue = trimTail(currentValue);
}
if (columnCount == values.length) {
int newLength = values.length * 2;
String[] holder = new String[newLength];
System.arraycopy(values, 0, holder, 0, values.length);
values = holder;
}
values[columnCount] = currentValue;
columnCount++;
sb.setLength(0);
}
/**
 * Parses the next record into values/columnCount.
 *
 * The outer loop looks at the first character of each field and dispatches to one of
 * two inner loops: one for quoted fields and one for unquoted fields. Both inner loops
 * refill the buffer on demand.
 *
 * @return true when a record was produced, false when the input is exhausted
 */
public boolean readNext() throws IOException {
columnCount = 0;
hasNext = false;
rawRecord = "";
if(!hasMoreData) {
return false;
}
while(hasMoreData && !hasNext) {
if(arriveEnd()) {
fill();
continue;
}
char currentChar = buffer[currentPosition];
inQuote = false;
if(quotechar!='\0' && currentChar == quotechar) {//quote char as start of column
inColumn = true;
inQuote = true;
currentPosition++;
escaping = false;
boolean previousCharAsQuote = false;
// Once set, everything up to the next separator/line end is dropped from the value.
boolean deleteTrailNoUseChars = false;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
while(hasMoreData && inColumn) {
if(arriveEnd()) {
fill();
continue;
}
currentChar = buffer[currentPosition];
if(deleteTrailNoUseChars){
if(currentChar == separator) {
endColumn();
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if((lineEnd == null && (currentChar == '\n' || currentChar == '\r'))
|| (lineEnd!=null && currentChar == lineEnd.charAt(0))) {
endColumn();
endRecord();
} else {
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
}
} else if(currentChar == quotechar) {
if(escaping) {//quote char as text
sb.append(currentChar);
escaping = false;
previousCharAsQuote = false;
} else {//quote char as escape or end of column
if(escapechar!='\0' && currentChar == escapechar) {
escaping = true;
}
previousCharAsQuote = true;
}
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if(escapechar!='\0' && escapechar!=quotechar && escaping) {
// Character following a dedicated escape char: translate the usual escape letters.
switch (currentChar) {
case 'n':
sb.append('\n');
break;
case 'r':
sb.append('\r');
break;
case 't':
sb.append('\t');
break;
case 'b':
sb.append('\b');
break;
case 'f':
sb.append('\f');
break;
case 'e':
sb.append('\u001B');
break;
case 'v':
sb.append('\u000B');
break;
case 'a':
sb.append('\u0007');
break;
default :
sb.append(currentChar);
break;
}
escaping = false;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if(escapechar!='\0' && currentChar == escapechar) {
escaping = true;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if(previousCharAsQuote) {//quote char as end of column
if(currentChar == separator) {
endColumn();
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if((lineEnd == null && (currentChar == '\n' || currentChar == '\r'))
|| (lineEnd!=null && currentChar == lineEnd.charAt(0))) {
endColumn();
endRecord();
} else {
deleteTrailNoUseChars = true;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
}
previousCharAsQuote = false;
} else {
sb.append(currentChar);
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
}
previousChar = currentChar;
currentPosition++;
}
} else if(currentChar == separator) {
// Separator met outside a field: closes an empty field.
previousChar = currentChar;
endColumn();
currentPosition++;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if (lineEnd!=null && currentChar == lineEnd.charAt(0)) {
// Only the first char of a custom line end is compared.
if (inColumn || columnCount > 0 || !skipEmptyRecords) {
endColumn();
endRecord();
}
currentPosition++;
previousChar = currentChar;
} else if(lineEnd==null && (currentChar == '\r' || currentChar == '\n')) {
// A '\n' directly after '\r' does not produce a second (empty) record.
if (inColumn || columnCount > 0 || (!skipEmptyRecords && (currentChar == '\r' || previousChar!='\r'))) {
endColumn();
endRecord();
}
currentPosition++;
previousChar = currentChar;
} else if(trimWhitespace && (currentChar == ' ' || currentChar == '\t')) {
// Leading blank: skipped from the value but still counts as being inside a column.
inColumn = true;
currentPosition++;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else {
// Unquoted field.
inColumn = true;
escaping = false;
while(hasMoreData && inColumn) {
if(arriveEnd()) {
fill();
continue;
}
currentChar = buffer[currentPosition];
if(quotechar == '\0' && escapechar != '\0' && currentChar == escapechar) {
if(escaping) {
sb.append(currentChar);
escaping = false;
} else {
escaping = true;
}
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if(escapechar!='\0' && escapechar!=quotechar && escaping) {
switch (currentChar) {
case 'n':
sb.append('\n');
break;
case 'r':
sb.append('\r');
break;
case 't':
sb.append('\t');
break;
case 'b':
sb.append('\b');
break;
case 'f':
sb.append('\f');
break;
case 'e':
sb.append('\u001B');
break;
case 'v':
sb.append('\u000B');
break;
case 'a':
sb.append('\u0007');
break;
default :
sb.append(currentChar);
break;
}
escaping = false;
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if(currentChar == separator) {
endColumn();
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
} else if((lineEnd == null && (currentChar == '\n' || currentChar == '\r'))
|| (lineEnd!=null && currentChar == lineEnd.charAt(0))) {
endColumn();
endRecord();
} else {
sb.append(currentChar);
if(storeRawRecord) {
stringBuilder.append(currentChar);
}
}
previousChar = currentChar;
currentPosition++;
}
}
}
// Input ended in the middle of a field, or right after a separator: flush the record.
if(inColumn || previousChar == separator) {
endColumn();
endRecord();
}
if(storeRawRecord) {
rawRecord = stringBuilder.toString();
stringBuilder.setLength(0);
}
return hasNext;
}
/** Returns the field at index of the current record, or "" when index is out of range. */
public String get(int index) {
if (index > -1 && index < columnCount) {
return values[index];
} else {
return "";
}
}
/** Returns a copy of the fields of the current record. */
public String[] getValues() {
String[] result = new String[columnCount];
System.arraycopy(values, 0, result, 0, columnCount);
return result;
}
/** Refills buffer from the reader and clears hasMoreData when the reader reports -1. */
private void fill() throws IOException {
int count = reader.read(buffer, 0, buffer.length);
currentPosition = 0;
bufferCount = count;
if(count == -1) {
hasMoreData = false;
}
}
/** True when every char currently in buffer has been consumed. */
private boolean arriveEnd() {
return currentPosition == bufferCount;
}
/** Removes trailing spaces and tabs from content. */
private String trimTail(String content) {
int len = content.length();
int newLen = len;
while (newLen > 0) {
char tail = content.charAt(newLen - 1);
if(tail != ' ' && tail != '\t') {
break;
}
newLen--;
}
if(newLen != len) {
content = content.substring(0,newLen);
}
return content;
}
/** Closes the underlying reader and drops the header index. */
public void close() throws IOException {
reader.close();
headersReader.clear();
}
//Added 20141016 TDQ-9496
// NOTE(review): despite its name this returns the index inside the fetch buffer,
// not a record number.
public int getCurrentRecord(){
return this.currentPosition;
}
public char getSeperator(){
return separator;
}
/**
* Read the first record of data as the column headers. Added 20141016 TDQ-9496
*
* @return If the header was successfully read or not.
*/
public boolean readHeaders() throws IOException {
boolean result = readNext();
headersReader.length = columnCount;
headersReader.headers = new String[columnCount];
for (int i = 0; i < headersReader.length; i++) {
String columnValue = get(i);
headersReader.headers[i] = columnValue;
headersReader.indexByHeaderName.put(columnValue, new Integer(i));
}
// Steps back one char so the char after the header record is read again by the
// next readNext(). NOTE(review): the reason is not visible here — confirm with callers.
if (result) {
currentPosition--;
}
columnCount = 0;
return result;
}
/**
* Returns the current column value for a given column header name.
*/
public String get(String headerName) throws IOException {
return get(getIndex(headerName));
}
/** Returns the column index recorded for headerName, or -1 when unknown or after close(). */
private int getIndex(String headerName) throws IOException {
if(headersReader.indexByHeaderName==null){
return -1;
}
Object indexValue = headersReader.indexByHeaderName.get(headerName);
if (indexValue != null) {
return ((Integer) indexValue).intValue();
} else {
return -1;
}
}
/** Returns a copy of the header names, or null when no headers are stored. */
public String[] getHeaders() throws IOException {
if (headersReader.headers == null) {
return null;
} else {
String[] clone = new String[headersReader.length];
System.arraycopy(headersReader.headers, 0, clone, 0,
headersReader.length);
return clone;
}
}
/** Holder for the header names and their name-to-index lookup. */
private class HeadersReader {
private String[] headers;
private int length;
private HashMap indexByHeaderName;
public HeadersReader() {
headers = null;
length = 0;
indexByHeaderName = new HashMap();
}
public void clear(){
headers = null;
indexByHeaderName = null;
}
}
/**End of added by TDQ-9496 **/
}

View File

@@ -0,0 +1,73 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.talend.libraries</groupId>
    <artifactId>talendcsv</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>

    <name>talend-csv</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <talend.nexus.url>https://artifacts-oss.talend.com</talend.nexus.url>
        <java.source.version>1.8</java.source.version>
        <junit5.version>5.4.2</junit5.version>
    </properties>

    <!-- Deployment targets: snapshots and releases go to separate repositories. -->
    <distributionManagement>
        <snapshotRepository>
            <id>talend_nexus_deployment</id>
            <url>${talend.nexus.url}/nexus/content/repositories/TalendOpenSourceSnapshot/</url>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
            <releases>
                <enabled>false</enabled>
            </releases>
        </snapshotRepository>
        <repository>
            <id>talend_nexus_deployment</id>
            <url>${talend.nexus.url}/nexus/content/repositories/TalendOpenSourceRelease/</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </distributionManagement>

    <dependencies>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>${junit5.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-engine</artifactId>
            <version>${junit5.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>${java.source.version}</source>
                    <target>${java.source.version}</target>
                    <showDeprecation>true</showDeprecation>
                    <showWarnings>true</showWarnings>
                    <fork>true</fork>
                </configuration>
            </plugin>
            <!--
                JUnit 5 (Jupiter) tests are only discovered by Surefire 2.22.0 or later.
                Pin the version so the tests are not silently skipped when the Maven
                installation defaults to an older Surefire.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.22.2</version>
            </plugin>
        </plugins>
    </build>
</project>

View File

@@ -0,0 +1,97 @@
package com.talend.csv;
/**
 * Parsing options shared by the CSV reader states: separator, quote and escape
 * characters, record terminator and whitespace / empty-record handling.
 */
public class CSVConfig {

    /** Field separator. */
    private char separator = ',';

    /** Character that opens and closes a quoted field. */
    private char quotechar = '"';

    /** Escape character; equal to the quote character by default. */
    private char escapechar = '"';

    /** Custom record terminator, or null for the built-in "\n" / "\r\n". */
    private String lineEnd = null;

    private boolean skipEmptyRecords = false;

    private boolean trimWhitespace = true;

    public char getSeparator() {
        return this.separator;
    }

    public void setSeparator(char separator) {
        this.separator = separator;
    }

    /** Tells whether value is the configured separator. */
    public boolean isSeparator(char value) {
        return this.separator == value;
    }

    /** Tells whether value is the configured quote character. */
    public boolean isQuoteChar(char value) {
        return value == this.quotechar;
    }

    public char getQuotechar() {
        return this.quotechar;
    }

    public void setQuotechar(char quotechar) {
        this.quotechar = quotechar;
    }

    public char getEscapechar() {
        return this.escapechar;
    }

    public void setEscapechar(char escapechar) {
        this.escapechar = escapechar;
    }

    /**
     * Tells whether value is a dedicated escape character. Always false when no escape
     * character is set ('\0') or when the escape character is the quote character.
     */
    public boolean isEscapechar(char value) {
        if (this.escapechar == '\0' || this.escapechar == this.quotechar) {
            return false; // mean no escape char.
        }
        return this.escapechar == value;
    }

    public boolean isSkipEmptyRecords() {
        return this.skipEmptyRecords;
    }

    public void setSkipEmptyRecords(boolean skipEmptyRecords) {
        this.skipEmptyRecords = skipEmptyRecords;
    }

    public boolean isTrimWhitespace() {
        return this.trimWhitespace;
    }

    public void setTrimWhitespace(boolean trimWhitespace) {
        this.trimWhitespace = trimWhitespace;
    }

    /**
     * Tells whether elem can be the character at index pos of a record terminator.
     * Without a custom terminator, position 0 accepts '\n' or '\r' and position 1
     * accepts '\n' only.
     */
    public boolean isLineEnd(char elem, int pos) {
        final String custom = this.lineEnd;
        if (custom != null) {
            return pos < custom.length() && custom.charAt(pos) == elem;
        }
        switch (pos) {
            case 0:
                return elem == '\r' || elem == '\n';
            case 1:
                return elem == '\n';
            default:
                return false;
        }
    }

    /**
     * Tells whether token is a complete record terminator: the custom one when set,
     * otherwise "\n" or "\r\n".
     */
    public boolean isLineSep(String token) {
        if (this.lineEnd != null) {
            return this.lineEnd.equals(token);
        }
        return "\r\n".equals(token) || "\n".equals(token);
    }

    public void setLineEnd(String lineEnd) {
        this.lineEnd = lineEnd;
    }
}

View File

@@ -0,0 +1,516 @@
package com.talend.csv;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.function.Consumer;
/**
 * CSV reader driven by a small character-level state machine.
 *
 * <p>Each call to {@link #readNext()} feeds characters from the {@link Source} to the
 * current {@link State} until a record is complete. The fields of that record are then
 * available through {@link #get(int)}, {@link #get(String)} and {@link #getValues()}.
 *
 * <p>NOTE(review): nothing in this class assigns {@code rawRecord}, so
 * {@link #getRawRecord()} currently always returns the empty string even when
 * {@link #setStoreRawRecord(boolean)} is enabled.
 */
public class CSVReader implements AutoCloseable {

    private final Source source;

    private final CSVConfig config = new CSVConfig();

    /** Set by {@link #toRecord(List)} when the state machine completes a record. */
    private boolean hasNext = false;

    /** Fields of the current record; only the first {@code columnCount} entries are valid. */
    private String[] values = new String[10];

    private HeadersReader headersReader = new HeadersReader();

    private int columnCount = 0;

    private boolean storeRawRecord = false;

    private String rawRecord = "";

    /** Current parser state; survives between calls to {@link #readNext()}. */
    private State state = new StartState(null);

    private CSVReader.ResultAction result = new CSVReader.ResultAction(this::toRecord);

    /** Opens the named file and reads it with the given separator and charset. */
    public CSVReader(String filename,char separator,String charset) throws IOException {
        this(new FileInputStream(filename), separator, charset);
    }

    /** Reads from a stream decoded through UnicodeReader (declared elsewhere in the project). */
    public CSVReader(InputStream inputStream,char separator,String charset) throws IOException {
        this(new UnicodeReader(inputStream, charset), separator);
    }

    /** Reads from an arbitrary reader. */
    public CSVReader(Reader reader,char separator) {
        this.source = new Source(reader);
        this.config.setSeparator(separator);
    }

    /**
     * Creates a comma-separated reader over an in-memory string.
     *
     * @throws IllegalArgumentException if content is null
     */
    public static CSVReader parse(String content) {
        if (content == null) {
            throw new IllegalArgumentException(
                    "Parameter content can not be null.");
        }
        return new CSVReader(new StringReader(content),',');
    }

    public CSVReader setLineEnd(String lineEnd) {
        this.config.setLineEnd(lineEnd);
        return this;
    }

    public CSVReader setSeparator(char separator) {
        this.config.setSeparator(separator);
        return this;
    }

    public CSVReader setEscapeChar(char escapechar) {
        this.config.setEscapechar(escapechar);
        return this;
    }

    public CSVReader setQuoteChar(char quotechar) {
        this.config.setQuotechar(quotechar);
        return this;
    }

    public char getQuoteChar() {
        return this.config.getQuotechar();
    }

    public CSVReader setTrimWhitespace(boolean trimWhitespace) {
        this.config.setTrimWhitespace(trimWhitespace);
        return this;
    }

    public CSVReader setSkipEmptyRecords(boolean skipEmptyRecords) {
        this.config.setSkipEmptyRecords(skipEmptyRecords);
        return this;
    }

    public CSVReader setStoreRawRecord(boolean storeRawRecord) {
        this.storeRawRecord = storeRawRecord;
        return this;
    }

    public String getRawRecord() {
        return rawRecord;
    }

    /** Callback of the state machine: publishes a finished record. */
    private void toRecord(List<String> fields) {
        this.values = fields.toArray(new String[0]);
        // Keep the column count in sync so that get(int) can see the new record.
        this.columnCount = this.values.length;
        this.hasNext = true;
    }

    /**
     * Parses the next record.
     *
     * @return true when a record was produced, false when the input is exhausted
     */
    public boolean readNext() throws IOException {
        hasNext = false;
        // No record is current until the state machine publishes one.
        columnCount = 0;
        if (!this.source.hasMoreData()) {
            return false;
        }
        while (this.source.hasMoreData() && !hasNext) {
            char currentChar = this.source.currentChar();
            this.state = this.state.accept(currentChar, this.config, result);
            this.source.next();
        }
        if (!this.source.hasMoreData()) {
            this.state = this.state.accept('\0', this.config, result); // end of file.
        }
        return hasNext;
    }

    /** Returns the field at index of the current record, or "" when index is out of range. */
    public String get(int index) {
        if (index > -1 && index < columnCount) {
            return values[index];
        } else {
            return "";
        }
    }

    /** Returns a copy of the fields of the current record (empty when there is none). */
    public String[] getValues() {
        String[] copy = new String[columnCount];
        System.arraycopy(values, 0, copy, 0, columnCount);
        return copy;
    }

    @Override
    public void close() throws IOException {
        this.source.close();
        headersReader.clear();
    }

    //Added 20141016 TDQ-9496
    // NOTE(review): despite its name this returns the position reported by the source,
    // not a record number.
    public int getCurrentRecord(){
        return this.source.getCurrentPosition();
    }

    public char getSeperator(){
        return this.config.getSeparator();
    }

    /**
     * Read the first record of data as the column headers. Added 20141016 TDQ-9496
     *
     * @return If the header was successfully read or not.
     */
    public boolean readHeaders() throws IOException {
        boolean headerRead = readNext();
        // columnCount is 0 when nothing was read, so no phantom headers are created.
        headersReader.length = columnCount;
        headersReader.headers = new String[columnCount];
        for (int i = 0; i < headersReader.length; i++) {
            String columnValue = get(i);
            headersReader.headers[i] = columnValue;
            headersReader.indexByHeaderName.put(columnValue, i);
        }
        if (headerRead) {
            // Step back one char so the char after the header record is parsed again.
            this.source.decreaseCurrentPosition();
        }
        columnCount = 0;
        return headerRead;
    }

    /**
     * Returns the current column value for a given column header name.
     */
    public String get(String headerName) throws IOException {
        return get(getIndex(headerName));
    }

    /** Returns the column index recorded for headerName, or -1 when unknown or after close(). */
    private int getIndex(String headerName) throws IOException {
        if (headersReader.indexByHeaderName == null) {
            return -1;
        }
        Integer indexValue = headersReader.indexByHeaderName.get(headerName);
        return indexValue != null ? indexValue.intValue() : -1;
    }

    /** Returns a copy of the header names, or null when no headers are stored. */
    public String[] getHeaders() throws IOException {
        if (headersReader.headers == null) {
            return null;
        } else {
            String[] clone = new String[headersReader.length];
            System.arraycopy(headersReader.headers, 0, clone, 0,
                    headersReader.length);
            return clone;
        }
    }

    /** Holder for the header names and their name-to-index lookup. */
    private static class HeadersReader {

        private String[] headers;

        private int length;

        private HashMap<String, Integer> indexByHeaderName;

        public HeadersReader() {
            headers = null;
            length = 0;
            indexByHeaderName = new HashMap<>();
        }

        public void clear(){
            headers = null;
            indexByHeaderName = null;
        }
    }
    /**End of added by TDQ-9496 **/

    /** Collects characters into fields and fields into records on behalf of the states. */
    static class ResultAction {

        private final List<String> fields = new ArrayList<>();

        private final StringBuilder field = new StringBuilder();

        private final Consumer<List<String>> recordConsumer;

        private boolean doTrimTail;

        public ResultAction(Consumer<List<String>> recordConsumer) {
            this.recordConsumer = recordConsumer;
        }

        public void addToCurrentField(char c) {
            this.field.append(c);
        }

        public void addToCurrentField(String c) {
            this.field.append(c);
        }

        /** Closes the current field, trimming trailing blanks when requested. */
        public void endField() {
            if (this.doTrimTail) {
                this.trimTail();
            }
            this.fields.add(this.field.toString());
            this.field.setLength(0);
        }

        /** Hands the collected fields to the consumer, then starts a new record. */
        public void endRecord(boolean skipEmpty) {
            if (!skipEmpty || this.fields.size() > 0) {
                this.recordConsumer.accept(this.fields);
            }
            this.fields.clear();
        }

        public void setDoTrimTail(boolean doTrimTail) {
            this.doTrimTail = doTrimTail;
        }

        public List<String> getFields() {
            return fields;
        }

        /** True when a field has been closed or started since the last record ended. */
        boolean hasPendingData() {
            return !this.fields.isEmpty() || this.field.length() > 0;
        }

        private void trimTail() {
            int end = this.field.length();
            while (end > 0) {
                char lastChar = this.field.charAt(end - 1);
                if (lastChar != ' ' && lastChar != '\t') {
                    break;
                }
                end--;
            }
            this.field.setLength(end);
        }
    }

    /** One state of the parser; accept() consumes a char and returns the next state. */
    static abstract class State {

        protected final State preceding;

        public State(State preceding) {
            this.preceding = preceding;
        }

        /** Walks up the chain of preceding states to the start state (or the top-most one). */
        public State backToStart() {
            // back to start.
            State prec = this.preceding;
            while (prec != null
                    && !(StartState.class.isInstance(prec))
                    && prec.preceding != null) {
                prec = prec.preceding;
            }
            return prec;
        }

        public abstract State accept(char newChar, CSVConfig config, ResultAction action);
    }

    /** State entered after an escape character inside a quoted field. */
    static class EscapeState extends State {

        public EscapeState(State preceding) {
            super(preceding);
        }

        @Override
        public State accept(char currentChar, CSVConfig config, ResultAction action) {
            if (currentChar == '\0' && this.preceding != null) {
                // End of input right after the escape char: let the field state flush
                // the record instead of storing the end marker as data.
                return this.preceding.accept(currentChar, config, action);
            }
            char real = currentChar;
            switch (currentChar) {
                case 'n':
                    real = '\n';
                    break;
                case 'r':
                    real = '\r';
                    break;
                case 't':
                    real = '\t';
                    break;
                case 'b':
                    real = '\b';
                    break;
                case 'f':
                    real = '\f';
                    break;
                case 'e':
                    real = '\u001B';
                    break;
                case 'v':
                    real = '\u000B';
                    break;
                case 'a':
                    real = '\u0007';
                    break;
                default:
                    break;
            }
            action.addToCurrentField(real);
            return this.preceding;
        }
    }

    /** State between fields: decides how the next field starts. */
    static class StartState extends State {

        public StartState(State preceding) {
            super(preceding);
        }

        @Override
        public State accept(char newChar, CSVConfig config, ResultAction action) {
            if ((newChar == '\t' || newChar == ' ') && config.isTrimWhitespace()) {
                return this;
            }
            if (newChar == '\0') {
                // End of input right after a separator: the record still has a last,
                // empty field and must not be dropped.
                if (action.hasPendingData()) {
                    action.setDoTrimTail(config.isTrimWhitespace());
                    action.endField();
                    action.endRecord(config.isSkipEmptyRecords());
                }
                return this;
            }
            if (newChar == config.getQuotechar()) {
                return new QuotedFieldState(this);
            }
            if (config.isSeparator(newChar)) {
                action.setDoTrimTail(config.isTrimWhitespace());
                action.endField();
                return this;
            }
            if (config.isLineEnd(newChar, 0)) {
                EndLineState state = new EndLineState(this);
                return state.accept(newChar, config, action);
            }
            UnQuotedFieldState nextStep = new UnQuotedFieldState(this);
            nextStep.accept(newChar, config, action);
            return nextStep;
        }
    }

    /** State inside (and just after) a quoted field. */
    static class QuotedFieldState extends State {

        /** Characters seen since a potential closing quote, kept until their role is known. */
        private final StringBuilder next = new StringBuilder();

        private boolean quoteClosed = false;

        public QuotedFieldState(State preceding) {
            super(preceding);
        }

        @Override
        public State accept(char newChar, CSVConfig config, ResultAction action) {
            action.setDoTrimTail(false);
            if (newChar == '\0') {
                // End of input, with or without a closing quote: flush what was read.
                next.setLength(0);
                action.endField();
                action.endRecord(config.isSkipEmptyRecords());
                return this.preceding;
            }
            if (config.isQuoteChar(newChar)) {
                if (!quoteClosed) {
                    quoteClosed = true;
                    this.next.append(newChar);
                } else if (config.getEscapechar() == config.getQuotechar()
                        && this.next.length() == 1) {
                    // Quote is its own escape char and directly follows a quote:
                    // the pair stands for one literal quote.
                    action.addToCurrentField(newChar);
                    this.next.setLength(0);
                    quoteClosed = false;
                }
                return this;
            }
            if (!quoteClosed) {
                if (config.isEscapechar(newChar)) {
                    return new EscapeState(this);
                }
                action.addToCurrentField(newChar);
                return this;
            }
            if (newChar == ' ' || newChar == '\t') {
                this.next.append(newChar);
                return this;
            }
            if (config.isSeparator(newChar)) {
                next.setLength(0);
                action.endField();
                return this.preceding;
            }
            if (config.isLineEnd(newChar, 0)) {
                next.setLength(0);
                quoteClosed = false;
                action.setDoTrimTail(false);
                EndLineState state = new EndLineState(this);
                return state.accept(newChar, config, action);
            }
            // field continue: the quote was not a closing one after all.
            action.addToCurrentField(next.toString());
            action.addToCurrentField(newChar);
            next.setLength(0);
            quoteClosed = false;
            return this;
        }
    }

    /** State matching a (possibly multi-character) record terminator. */
    static class EndLineState extends State {

        private int pos = 0;

        private final StringBuilder builder = new StringBuilder(4);

        public EndLineState(State preceding) {
            super(preceding);
        }

        @Override
        public State accept(char newChar, CSVConfig config, ResultAction action) {
            // end of line continue
            this.builder.append(newChar);
            if (config.isLineSep(this.builder.toString())) {
                // end of line complete
                boolean blankLine = StartState.class.isInstance(this.preceding)
                        && !action.hasPendingData();
                if (!(blankLine && config.isSkipEmptyRecords())) {
                    action.endField();
                    action.endRecord(config.isSkipEmptyRecords());
                }
                return this.backToStart();
            }
            if (config.isLineEnd(newChar, pos)) {
                this.pos++;
                return this;
            }
            // not end of line: the chars matched so far are data, and the char that
            // broke the match is handed back to the preceding state so that a
            // separator, a new line end or the end of input is still recognised.
            this.builder.setLength(this.builder.length() - 1);
            action.addToCurrentField(builder.toString());
            this.pos = 0;
            this.builder.setLength(0);
            return this.preceding.accept(newChar, config, action);
        }
    }

    /** State inside an unquoted field. */
    static class UnQuotedFieldState extends State {

        public UnQuotedFieldState(State preceding) {
            super(preceding);
        }

        @Override
        public State accept(char newChar, CSVConfig config, ResultAction action) {
            action.setDoTrimTail(config.isTrimWhitespace());
            if (config.isSeparator(newChar)) {
                action.endField();
                return this.preceding;
            }
            if (newChar == '\0') {
                action.endField();
                action.endRecord(config.isSkipEmptyRecords());
                return this.preceding;
            }
            if (config.isLineEnd(newChar, 0)) {
                EndLineState state = new EndLineState(this);
                return state.accept(newChar, config, action);
            }
            action.addToCurrentField(newChar);
            return this;
        }
    }
}

View File

@@ -0,0 +1,85 @@
package com.talend.csv;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
/**
 * Character source with one character of look-behind, read from a {@link Reader}
 * through a small fetch buffer.
 *
 * <p>The buffer is refilled whenever the characters actually delivered by the last
 * read are used up, so a short read is not mistaken for the end of the input. The last
 * consumed character is kept across refills, which makes a single
 * {@link #decreaseCurrentPosition()} safe at any point.
 */
public class Source implements AutoCloseable {

    /** Maximum number of chars requested from the reader per refill. */
    private static final int FETCH_SIZE = 10 * 50;

    private static final int BUFFER_SIZE = 4 * 1024;

    /** One extra slot holds the last consumed char of the previous fetch. */
    private final char[] buffer = new char[FETCH_SIZE + 1];

    /** Index in buffer of the char returned by currentChar(). */
    private int currentPosition = 0;

    /** Number of valid chars in buffer. */
    private int bufferCount = 0;

    private boolean hasMoreData = true;

    private final Reader reader;

    private char previousChar = '\0';

    /** Read failure met inside hasMoreData(), rethrown by the next reading call. */
    private IOException pendingFailure;

    /**
     * @param reader input to read from; wrapped in a BufferedReader unless it already is one
     */
    public Source(Reader reader) {
        if (!(reader instanceof BufferedReader)) {
            this.reader = new BufferedReader(reader, BUFFER_SIZE);
        }
        else {
            this.reader = reader;
        }
    }

    @Override
    public void close() throws IOException {
        this.reader.close();
    }

    /**
     * Returns the char at the current position without consuming it.
     *
     * @throws IOException if reading fails or the input is exhausted
     */
    public char currentChar() throws IOException {
        this.ensureBuffered();
        if (!this.hasMoreData) {
            throw new IOException("Has no more data.");
        }
        return this.buffer[ this.currentPosition ];
    }

    /** Returns the char consumed by the last call to next(), or '\0' before the first one. */
    public char previousChar() throws IOException {
        return this.previousChar;
    }

    /**
     * Consumes the current char.
     *
     * @return true when another char is available afterwards
     * @throws IOException if reading fails or the input was already exhausted
     */
    public boolean next() throws IOException {
        this.previousChar = this.currentChar();
        this.currentPosition++;
        this.ensureBuffered();
        return this.hasMoreData;
    }

    /**
     * Tells whether a char is available. May read from the underlying reader; since this
     * method cannot throw a checked exception, a read failure is remembered and thrown by
     * the next call to currentChar() or next().
     */
    public boolean hasMoreData() {
        try {
            this.ensureBuffered();
        } catch (IOException e) {
            this.pendingFailure = e;
        }
        return hasMoreData;
    }

    public int getCurrentPosition() {
        return currentPosition;
    }

    /** Steps back by one char; does nothing when already at the start of the buffer. */
    public void decreaseCurrentPosition() {
        if (this.currentPosition > 0) {
            this.currentPosition--;
        }
    }

    /** Refills the buffer when every delivered char has been consumed. */
    private void ensureBuffered() throws IOException {
        if (this.pendingFailure != null) {
            IOException failure = this.pendingFailure;
            this.pendingFailure = null;
            throw failure;
        }
        while (this.hasMoreData && this.currentPosition >= this.bufferCount) {
            this.fill();
        }
    }

    /**
     * Fetches the next chunk. On end of input only hasMoreData changes, so the buffer
     * content and position stay usable for a step back.
     */
    private void fill() throws IOException {
        int kept = bufferCount > 0 ? 1 : 0;
        char last = kept == 1 ? buffer[bufferCount - 1] : '\0';
        int count = reader.read(buffer, kept, FETCH_SIZE);
        if (count < 0) {
            hasMoreData = false;
            return;
        }
        if (kept == 1) {
            // Keep the last consumed char in slot 0 for decreaseCurrentPosition().
            buffer[0] = last;
        }
        currentPosition = kept;
        bufferCount = kept + count;
    }
}

View File

@@ -0,0 +1,77 @@
package com.talend.csv;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.List;
/**
 * Exercises the parser states of CSVReader directly, without going through a reader.
 */
class CSVReaderStateTest {

    /** Shared configuration: comma separator, double-quote quoting, backslash escape. */
    private static final CSVConfig config = new CSVConfig();

    @BeforeAll
    public static void init() {
        config.setQuotechar('"');
        config.setEscapechar('\\');
        config.setSeparator(',');
    }

    /** Feeds the tail of a quoted field (opening quote already consumed) and checks its value. */
    @Test
    public void quotedField() {
        final String[][] cases = {
                { "Hello\",", "Hello" },
                { "He\\nllo\",", "He\nllo" },
                { "Hello\"toto\",", "Hello\"toto" },
                { "Hello\" toto\" ,", "Hello\" toto" },
        };
        for (String[] oneCase : cases) {
            checkField(new CSVReader.QuotedFieldState(null), oneCase[0], oneCase[1]);
        }
    }

    /** Parses two complete lines from the start state and checks every field. */
    @Test
    public void outsideField() {
        final String input = "Hello,\"World\" , next \n Nex,\"World \t\" , ne\txt \n";
        final List<List<String>> collected = new ArrayList<>();
        final CSVReader.ResultAction sink =
                new CSVReader.ResultAction((List<String> fields) -> collected.add(new ArrayList<>(fields)));

        this.accept(new CSVReader.StartState(null), input, sink);

        Assertions.assertEquals(2, collected.size());

        final List<String> first = collected.get(0);
        Assertions.assertEquals(3, first.size());
        Assertions.assertEquals("Hello", first.get(0));
        Assertions.assertEquals("World", first.get(1));
        Assertions.assertEquals("next", first.get(2));

        final List<String> second = collected.get(1);
        Assertions.assertEquals(3, second.size());
        Assertions.assertEquals("Nex", second.get(0));
        Assertions.assertEquals("World \t", second.get(1));
        Assertions.assertEquals("ne\txt", second.get(2));
    }

    /**
     * Runs from through the given state and expects exactly one field equal to to,
     * with the machine ending on a null state.
     */
    private void checkField(CSVReader.State state, String from, String to) {
        final CSVReader.ResultAction sink = new CSVReader.ResultAction(null);
        final CSVReader.State last = this.accept(state, from, sink);
        final List<String> produced = sink.getFields();
        Assertions.assertEquals(1, produced.size());
        Assertions.assertEquals(to, produced.get(0));
        Assertions.assertNull(last);
    }

    /** Feeds every char of value to the state machine and returns the final state. */
    private CSVReader.State accept(CSVReader.State state, String value, CSVReader.ResultAction action) {
        CSVReader.State current = state;
        for (char c : value.toCharArray()) {
            current = current.accept(c, config, action);
        }
        return current;
    }
}

View File

@@ -0,0 +1,147 @@
package com.talend.csv;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import static org.junit.jupiter.api.Assertions.*;
class CSVReaderTest {
/**
 * Parses the same multi-line fixture twice, first with whitespace trimming off and
 * then with it on, and compares every record field by field.
 * NOTE(review): checkNextValues is defined outside this view; presumably it reads the
 * next record and compares its fields with the expected ones — confirm.
 */
@Test
void readNext() throws IOException {
String lines = "\"event_id\",\"event_name\",\"event_value\",\"source\"\n" + // titles.
"\"001\",\"CN\",\"This is some \\\\ttext\",\"event_200\"\n" + // normal line
"\"002\",\"CN\",\"This is some text, with sep\",\"event_250\"\n" + // test field sep inside value
"\"003\",\"CN\",\"This is some \\\"text\\\" inside value\",\"event_300\"\n" + // escape quote inside value
"\"004\",\"CN\",\"This is some other \"text\" inside value\",\"event_400\"\n" + // unescape quote inside value
"\"005\" , \"CN\" , \"This is some text\" , event_500\n" + // spaced field
"006, CN , \"Text\" ,\" xx \" \n" + // unquoted fields.
"007,,\"\",\" xx \" "; // empty record.
// First pass: backslash escape, no trimming.
final CSVReader reader = new CSVReader(new StringReader(lines), ',');
reader.setEscapeChar('\\').setStoreRawRecord(true).setTrimWhitespace(false);
boolean headers1 = reader.readHeaders();
final String[] headers = reader.getHeaders();
Assertions.assertAll(
() -> Assertions.assertEquals(4, headers.length),
() -> Assertions.assertEquals("event_id", headers[0]),
() -> Assertions.assertEquals("event_name", headers[1]),
() -> Assertions.assertEquals("event_value", headers[2]),
() -> Assertions.assertEquals("source", headers[3])
);
// One record is consumed here, right after the headers, before the data lines are checked.
Assertions.assertTrue(reader.readNext());
Assertions.assertAll(
() -> this.checkNextValues("normal", reader, "001", "CN", "This is some \\ttext", "event_200"),
() -> this.checkNextValues("field sep in value", reader, "002", "CN", "This is some text, with sep", "event_250"),
() -> this.checkNextValues("escape quote inside value", reader, "003", "CN", "This is some \"text\" inside value", "event_300"),
() -> this.checkNextValues("unescape quote inside value", reader, "004", "CN", "This is some other \"text\" inside value", "event_400"),
() -> this.checkNextValues("spaced field", reader, "005", " \"CN\" "," \"This is some text\" ", " event_500"),
() -> this.checkNextValues("unquoted fields", reader, "006", " CN "," \"Text\" ", " xx "),
() -> this.checkNextValues("empty record", reader, "007", "", "", " xx ")
);
Assertions.assertFalse(reader.readNext());
// Second pass: same input with trimming enabled.
final CSVReader reader2 = new CSVReader(new StringReader(lines), ',');
reader2.setEscapeChar('\\').setStoreRawRecord(true).setTrimWhitespace(true);
reader2.readHeaders();
reader2.getHeaders();
Assertions.assertTrue(reader2.readNext());
Assertions.assertAll(
() -> this.checkNextValues("normal 2", reader2, "001", "CN", "This is some \\ttext", "event_200"),
() -> this.checkNextValues("field sep in value 2", reader2, "002", "CN", "This is some text, with sep", "event_250"),
() -> this.checkNextValues("escape quote inside value 2", reader2, "003", "CN", "This is some \"text\" inside value", "event_300"),
() -> this.checkNextValues("unescape quote inside value", reader2, "004", "CN", "This is some other \"text\" inside value", "event_400"),
() -> this.checkNextValues("spaced field 2", reader2, "005", "CN","This is some text", "event_500"),
() -> this.checkNextValues("unquoted fields 2", reader2, "006", "CN","Text", " xx "),
() -> this.checkNextValues("empty record 2", reader2, "007", "", "", " xx ")
);
Assertions.assertFalse(reader2.readNext());
}
/**
 * Reads a single four-field record with backslash as escape character,
 * whitespace trimming enabled and empty-record skipping enabled.
 *
 * The record holds an unquoted field containing a backslash sequence, an empty
 * field, a quoted empty field surrounded by spaces and a quoted field with a
 * backslash sequence between spaces. The expected values are listed in the
 * assertion below.
 *
 * @throws IOException declared because the reader API may throw it.
 */
@Test
void readNextEmptyRecord() throws IOException {
    final String csv = "0\\t07,, \"\" ,\" x\\tx \" ";
    final StringReader source = new StringReader(csv);
    final CSVReader reader = new CSVReader(source, ',');

    // Reader configuration under test.
    reader.setEscapeChar('\\');
    reader.setTrimWhitespace(true);
    reader.setSkipEmptyRecords(true);

    Assertions.assertAll(
            () -> checkNextValues("empty record", reader, "0\\t07", "", "", " x\tx "));
}
/**
 * Checks record splitting when a two-character line end ("@#") is configured.
 *
 * Two inputs are parsed: a short one with two records, and a longer one with
 * four records in which the individual characters of the line end also appear
 * on their own inside the values, and whose last value ends with a newline.
 *
 * @throws IOException declared because the reader API may throw it.
 */
@Test
void lineSepTest() throws IOException {
    // First input: two records.
    final String shortInput = "line@1@#line#2";
    final CSVReader shortReader = new CSVReader(new StringReader(shortInput), ',');
    shortReader.setLineEnd("@#");
    Assertions.assertAll(
            () -> checkNextValues("line 1 for line sep", shortReader, "line@1"),
            () -> checkNextValues("line 2 for line sep", shortReader, "line#2"));

    // Second input: four records, the last one keeps its trailing newline.
    final String longInput = "Hello@#World@#With@butoneline@#With#butoneline\n";
    final CSVReader longReader = new CSVReader(new StringReader(longInput), ',');
    longReader.setLineEnd("@#");
    Assertions.assertAll(
            () -> checkNextValues("line 1 for line sep", longReader, "Hello"),
            () -> checkNextValues("line 2 for line sep", longReader, "World"),
            () -> checkNextValues("line 3 for line sep", longReader, "With@butoneline"),
            () -> checkNextValues("line 4 for line sep", longReader, "With#butoneline\n"));
}
/**
 * Parses two newline-separated, single-field records that contain doubled and
 * lone quote characters. No quote or escape character is set explicitly, so
 * the reader is used as constructed.
 *
 * The first record starts with a quote, the second one does not; the expected
 * value of each record is given in the assertions below.
 *
 * @throws IOException declared because the reader API may throw it.
 */
@Test
void testEscapeIsQuote() throws IOException {
    final String content = "\"L\"\"in\"te 1\"\nLine\"t\"\"2";
    final StringReader source = new StringReader(content);
    final CSVReader reader = new CSVReader(source, ',');

    Assertions.assertAll(
            () -> checkNextValues("line 1", reader, "L\"in\"te 1"),
            () -> checkNextValues("line 2", reader, "Line\"t\"\"2"));
}
/**
 * Parses three records of two quoted fields each, with the double quote set as
 * both quote and escape character, whitespace trimming disabled and
 * empty-record skipping disabled.
 *
 * After the three records have been checked, the reader must report that no
 * further record is available, even though the input ends with a newline.
 *
 * @throws IOException declared because the reader API may throw it.
 */
@Test
void testQuoted() throws IOException {
    String input = "\"Hello\",\"ss\"\n\"World\",\"ddzs\"\n\"OneColumn\",\"ddzs\"\n";
    // The test is fed from an in-memory string only, so it does not depend on
    // any file present on the machine running it.
    final CSVReader reader = new CSVReader(new StringReader(input), ',');
    reader.setQuoteChar('"');
    reader.setTrimWhitespace(false);
    reader.setEscapeChar('"');
    reader.setSkipEmptyRecords(false);
    Assertions.assertAll(
            () -> checkNextValues("line 1", reader, "Hello", "ss"),
            () -> checkNextValues("line 2", reader, "World", "ddzs"),
            () -> checkNextValues("line 3", reader, "OneColumn", "ddzs")
    );
    // The trailing newline must not produce an extra record.
    Assertions.assertFalse(reader.readNext());
}
/**
 * Advances the reader by one record and compares its values with the expected
 * ones.
 *
 * Asserts, in this order, that a next record exists, that it has as many
 * values as expected, and that each value equals the expected one at the same
 * index.
 *
 * @param comment  label prepended to every failure message of the value checks.
 * @param reader   reader to advance; it is moved forward by exactly one readNext() call.
 * @param excepted expected field values of the record, in order.
 * @throws IOException declared because the reader API may throw it.
 */
void checkNextValues(String comment, CSVReader reader, String... excepted) throws IOException {
    Assertions.assertTrue(reader.readNext());

    final String[] actual = reader.getValues();
    final int wantedCount = excepted.length;
    Assertions.assertEquals(wantedCount, actual.length, comment + " : wrong length");

    int index = 0;
    for (String wanted : excepted) {
        Assertions.assertEquals(wanted, actual[index], comment + " : field " + index + " in error");
        index++;
    }
}
}

View File

@@ -1127,7 +1127,7 @@
csvReader<%=cid %>.setTrimWhitespace(false);
if ( (rowSeparator_<%=cid %>[0] != '\n') && (rowSeparator_<%=cid %>[0] != '\r') )
csvReader<%=cid %>.setLineEnd(""+rowSeparator_<%=cid %>[0]);
csvReader<%=cid %>.setLineEnd(new String(rowSeparator_<%=cid %>));
<%
if(("").equals(textEnclosure1) || textEnclosure1.startsWith("\"")){//normal situation
%>
@@ -1228,7 +1228,7 @@
}
csvReader<%=cid %>.setTrimWhitespace(false);
if ( (rowSeparator_<%=cid %>[0] != '\n') && (rowSeparator_<%=cid %>[0] != '\r') )
csvReader<%=cid %>.setLineEnd(""+rowSeparator_<%=cid %>[0]);
csvReader<%=cid %>.setLineEnd(new String(rowSeparator_<%=cid %>));
<%
if(("").equals(textEnclosure1) || textEnclosure1.startsWith("\"")){//normal situation
%>