TBD-1238
Introduce new split function to handle direct byte buffers and apply encoding rules to handle mixed encoded record (ex: ISO-8859-15 + UTF-8) Conflicts: main/plugins/org.talend.librariesmanager/resources/java/routines/system/StringUtils.java
This commit is contained in:
@@ -12,6 +12,13 @@
|
||||
// ============================================================================
|
||||
package routines.system;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
@@ -22,14 +29,111 @@ public class StringUtils {
|
||||
public static final String[] EMPTY_STRING_ARRAY = new String[0];
|
||||
|
||||
public static final String EMPTY = "";
|
||||
|
||||
public static String newStringFromSplit(CharsetDecoder decoder, CharsetDecoder utf8Decoder, String encoding,
|
||||
byte[] fieldBytes, int length) {
|
||||
ByteBuffer fieldBuf = ByteBuffer.wrap(fieldBytes, 0, length);
|
||||
CharBuffer fieldCharBuf = CharBuffer.allocate(length);
|
||||
utf8Decoder.reset();
|
||||
CoderResult res = utf8Decoder.decode(fieldBuf, fieldCharBuf, true);
|
||||
if (res.isError() && decoder != null) {
|
||||
decoder.reset();
|
||||
res = decoder.decode(fieldBuf, fieldCharBuf, true);
|
||||
if (!res.isError()) {
|
||||
decoder.flush(fieldCharBuf);
|
||||
return new String(fieldCharBuf.array());
|
||||
}
|
||||
} else {
|
||||
utf8Decoder.flush(fieldCharBuf);
|
||||
return new String(fieldCharBuf.array());
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
public static String[] splitNotRegexWithEncoding(byte[] bline, String encoding, String separatorChars)
|
||||
throws UnsupportedEncodingException {
|
||||
if (bline == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
ByteBuffer line = ByteBuffer.wrap(bline);
|
||||
|
||||
byte[] sep = null;
|
||||
CharsetDecoder decoder = null;
|
||||
if (encoding != null) {
|
||||
sep = separatorChars.getBytes(encoding);
|
||||
|
||||
decoder = Charset.forName(encoding).newDecoder();
|
||||
decoder.onMalformedInput(CodingErrorAction.REPORT);
|
||||
decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
} else {
|
||||
sep = separatorChars.getBytes();
|
||||
}
|
||||
|
||||
if (sep.length == 0) {
|
||||
String[] result = new String[1];
|
||||
result[0] = new String(bline, encoding);
|
||||
return result;
|
||||
}
|
||||
|
||||
CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$
|
||||
utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
|
||||
utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
|
||||
ArrayList<String> substrings = new ArrayList<String>();
|
||||
|
||||
int lineLength = line.limit();
|
||||
int sepCursor = 0;
|
||||
int fieldCursor = 0;
|
||||
byte[] fieldBytes = new byte[lineLength];
|
||||
while (line.position() < line.limit()) {
|
||||
if (sepCursor < sep.length) {
|
||||
byte currentByte = line.get();
|
||||
if (currentByte == sep[sepCursor]) {
|
||||
sepCursor++;
|
||||
} else {
|
||||
sepCursor = 0;
|
||||
fieldBytes[fieldCursor++] = currentByte;
|
||||
}
|
||||
} else {
|
||||
// we found a new field
|
||||
if (fieldCursor > 0) {
|
||||
substrings.add(newStringFromSplit(decoder, utf8Decoder, encoding, fieldBytes, fieldCursor));
|
||||
fieldCursor = 0;
|
||||
} else {
|
||||
// empty field
|
||||
substrings.add(""); //$NON-NLS-1$
|
||||
}
|
||||
sepCursor = 0;
|
||||
}
|
||||
}
|
||||
if (fieldCursor > 0) {
|
||||
substrings.add(newStringFromSplit(decoder, utf8Decoder, encoding, fieldBytes, fieldCursor));
|
||||
}
|
||||
if (sepCursor == sep.length) {
|
||||
substrings.add(""); //$NON-NLS-1$
|
||||
}
|
||||
|
||||
int resultSize = substrings.size();
|
||||
if (resultSize == 0) {
|
||||
// no delimiter found so we have only one column
|
||||
String[] result = new String[1];
|
||||
result[0] = new String(bline, encoding);
|
||||
return result;
|
||||
}
|
||||
String[] result = new String[resultSize];
|
||||
substrings.toArray(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* replace the method : String.split(String regex)
|
||||
*
|
||||
* @param str
|
||||
* @param separatorChars
|
||||
* @return
|
||||
*/
|
||||
public static String[] splitNotRegex(String str,String separatorChars) {
|
||||
public static String[] splitNotRegex(String str, String separatorChars) {
|
||||
if (str == null) {
|
||||
return null;
|
||||
}
|
||||
@@ -63,7 +167,7 @@ public class StringUtils {
|
||||
}
|
||||
|
||||
int resultSize = substrings.size();
|
||||
while (resultSize > 0 && substrings.get(resultSize-1).equals("")) {
|
||||
while (resultSize > 0 && substrings.get(resultSize - 1).equals("")) {
|
||||
resultSize--;
|
||||
}
|
||||
String[] result = new String[resultSize];
|
||||
@@ -236,7 +340,7 @@ public class StringUtils {
|
||||
} else {
|
||||
// case 3:
|
||||
if (replacement == null) {
|
||||
if ( (caseSensitive && src.equals(search)) || (!caseSensitive && src.equalsIgnoreCase(search))) {
|
||||
if ((caseSensitive && src.equals(search)) || (!caseSensitive && src.equalsIgnoreCase(search))) {
|
||||
// regex != null && src != null && replacement != null, and match the whole src
|
||||
return replacement;
|
||||
} else {
|
||||
@@ -305,8 +409,8 @@ public class StringUtils {
|
||||
|
||||
|
||||
/**
|
||||
* return null value not "null" String when obj is null
|
||||
* that is the only difference with String.valueOf(Object obj)
|
||||
* return null value not "null" String when obj is null that is the only difference with String.valueOf(Object obj)
|
||||
*
|
||||
* @param obj
|
||||
* @return
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user