Introduce new split function to handle direct byte buffers and apply encoding rules to handle mixed encoded record (ex: ISO-8859-15 + UTF-8)

Conflicts:
	main/plugins/org.talend.librariesmanager/resources/java/routines/system/StringUtils.java
This commit is contained in:
ftom
2014-11-12 13:29:54 +01:00
parent aed43b5320
commit 7cb6a7e447

View File

@@ -12,6 +12,13 @@
// ============================================================================
package routines.system;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
@@ -22,14 +29,111 @@ public class StringUtils {
public static final String[] EMPTY_STRING_ARRAY = new String[0];
public static final String EMPTY = "";
public static String newStringFromSplit(CharsetDecoder decoder, CharsetDecoder utf8Decoder, String encoding,
byte[] fieldBytes, int length) {
ByteBuffer fieldBuf = ByteBuffer.wrap(fieldBytes, 0, length);
CharBuffer fieldCharBuf = CharBuffer.allocate(length);
utf8Decoder.reset();
CoderResult res = utf8Decoder.decode(fieldBuf, fieldCharBuf, true);
if (res.isError() && decoder != null) {
decoder.reset();
res = decoder.decode(fieldBuf, fieldCharBuf, true);
if (!res.isError()) {
decoder.flush(fieldCharBuf);
return new String(fieldCharBuf.array());
}
} else {
utf8Decoder.flush(fieldCharBuf);
return new String(fieldCharBuf.array());
}
return "";
}
public static String[] splitNotRegexWithEncoding(byte[] bline, String encoding, String separatorChars)
throws UnsupportedEncodingException {
if (bline == null) {
return null;
}
ByteBuffer line = ByteBuffer.wrap(bline);
byte[] sep = null;
CharsetDecoder decoder = null;
if (encoding != null) {
sep = separatorChars.getBytes(encoding);
decoder = Charset.forName(encoding).newDecoder();
decoder.onMalformedInput(CodingErrorAction.REPORT);
decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
} else {
sep = separatorChars.getBytes();
}
if (sep.length == 0) {
String[] result = new String[1];
result[0] = new String(bline, encoding);
return result;
}
CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$
utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
ArrayList<String> substrings = new ArrayList<String>();
int lineLength = line.limit();
int sepCursor = 0;
int fieldCursor = 0;
byte[] fieldBytes = new byte[lineLength];
while (line.position() < line.limit()) {
if (sepCursor < sep.length) {
byte currentByte = line.get();
if (currentByte == sep[sepCursor]) {
sepCursor++;
} else {
sepCursor = 0;
fieldBytes[fieldCursor++] = currentByte;
}
} else {
// we found a new field
if (fieldCursor > 0) {
substrings.add(newStringFromSplit(decoder, utf8Decoder, encoding, fieldBytes, fieldCursor));
fieldCursor = 0;
} else {
// empty field
substrings.add(""); //$NON-NLS-1$
}
sepCursor = 0;
}
}
if (fieldCursor > 0) {
substrings.add(newStringFromSplit(decoder, utf8Decoder, encoding, fieldBytes, fieldCursor));
}
if (sepCursor == sep.length) {
substrings.add(""); //$NON-NLS-1$
}
int resultSize = substrings.size();
if (resultSize == 0) {
// no delimiter found so we have only one column
String[] result = new String[1];
result[0] = new String(bline, encoding);
return result;
}
String[] result = new String[resultSize];
substrings.toArray(result);
return result;
}
/**
* replace the method : String.split(String regex)
*
* @param str
* @param separatorChars
* @return
*/
public static String[] splitNotRegex(String str,String separatorChars) {
public static String[] splitNotRegex(String str, String separatorChars) {
if (str == null) {
return null;
}
@@ -63,7 +167,7 @@ public class StringUtils {
}
int resultSize = substrings.size();
while (resultSize > 0 && substrings.get(resultSize-1).equals("")) {
while (resultSize > 0 && substrings.get(resultSize - 1).equals("")) {
resultSize--;
}
String[] result = new String[resultSize];
@@ -236,7 +340,7 @@ public class StringUtils {
} else {
// case 3:
if (replacement == null) {
if ( (caseSensitive && src.equals(search)) || (!caseSensitive && src.equalsIgnoreCase(search))) {
if ((caseSensitive && src.equals(search)) || (!caseSensitive && src.equalsIgnoreCase(search))) {
// regex != null && src != null && replacement != null, and match the whole src
return replacement;
} else {
@@ -305,8 +409,8 @@ public class StringUtils {
/**
* return null value not "null" String when obj is null
* that is the only difference with String.valueOf(Object obj)
* return null value not "null" String when obj is null that is the only difference with String.valueOf(Object obj)
*
* @param obj
* @return
*/