TBD-1238

Introduce a new split function to handle direct byte buffers and apply encoding rules to handle mixed-encoded records (e.g. ISO-8859-15 + UTF-8). Conflicts: main/plugins/org.talend.librariesmanager/resources/java/routines/system/StringUtils.java
2014-11-12 13:29:54 +01:00
parent aed43b5320
commit 7cb6a7e447
1 changed files with 109 additions and 5 deletions
--- a/main/plugins/org.talend.librariesmanager/resources/java/routines/system/StringUtils.java
+++ b/main/plugins/org.talend.librariesmanager/resources/java/routines/system/StringUtils.java
@@ -12,6 +12,13 @@
 // ============================================================================
 package routines.system;

+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
@@ -22,14 +29,111 @@ public class StringUtils {
 	public static final String[] EMPTY_STRING_ARRAY = new String[0];
 	
 	public static final String EMPTY = "";
+
+    /**
+     * Decodes the first {@code length} bytes of {@code fieldBytes} into a string, trying UTF-8 first.
+     * <p>
+     * If the UTF-8 decoder reports an error and {@code decoder} is not null, decoding continues with {@code decoder}
+     * from the position at which the UTF-8 decoder left the input buffer; characters that were already decoded as
+     * UTF-8 are kept. If {@code decoder} is null, whatever the UTF-8 decoder produced before the error is returned.
+     * <p>
+     * Only the characters actually produced by the decoders are returned. The working buffer holds one char per
+     * input byte, so multi-byte sequences leave unused slots at its end which must not leak into the result.
+     * 
+     * @param decoder fallback decoder used when the bytes are not valid UTF-8, may be null
+     * @param utf8Decoder decoder tried first; errors are only detected if it is configured to report them
+     * @param encoding not used by this method, kept so that existing callers keep compiling
+     * @param fieldBytes buffer holding the raw bytes of the field, starting at index 0
+     * @param length number of valid bytes in {@code fieldBytes}
+     * @return the decoded field, or an empty string when the fallback decoder reports an error as well
+     */
+    public static String newStringFromSplit(CharsetDecoder decoder, CharsetDecoder utf8Decoder, String encoding,
+            byte[] fieldBytes, int length) {
+        ByteBuffer fieldBuf = ByteBuffer.wrap(fieldBytes, 0, length);
+        CharBuffer fieldCharBuf = CharBuffer.allocate(length);
+        utf8Decoder.reset();
+        CoderResult res = utf8Decoder.decode(fieldBuf, fieldCharBuf, true);
+        if (res.isError() && decoder != null) {
+            // not valid UTF-8: let the configured decoder handle the bytes that are still unread
+            decoder.reset();
+            res = decoder.decode(fieldBuf, fieldCharBuf, true);
+            if (res.isError()) {
+                return ""; //$NON-NLS-1$
+            }
+            decoder.flush(fieldCharBuf);
+        } else {
+            utf8Decoder.flush(fieldCharBuf);
+        }
+        // position() is the number of chars written; the rest of the backing array is still zero-filled
+        return new String(fieldCharBuf.array(), 0, fieldCharBuf.position());
+    }
+
+    /**
+     * Splits a raw line on a literal (non regex) separator and decodes every field on its own, so that a record
+     * mixing encodings (ex: ISO-8859-15 + UTF-8) can still be read. Each non-empty field is decoded with
+     * {@link #newStringFromSplit(CharsetDecoder, CharsetDecoder, String, byte[], int)}.
+     * <p>
+     * The separator is searched as a byte sequence (the separator encoded with {@code encoding}). Every occurrence
+     * ends a field, so leading, consecutive and trailing separators all produce empty fields, and a line of length
+     * zero yields a single empty field.
+     * 
+     * @param bline raw bytes of the line, may be null
+     * @param encoding name of the charset used to encode the separator and as fallback when a field is not valid
+     * UTF-8; when null the platform default charset encodes the separator and no fallback decoder is used
+     * @param separatorChars literal separator, must not be null
+     * @return the decoded fields, or null when {@code bline} is null; when the separator is empty, the whole line
+     * decoded with {@code encoding} as a single field
+     * @throws UnsupportedEncodingException if {@code encoding} is not a supported charset name
+     */
+    public static String[] splitNotRegexWithEncoding(byte[] bline, String encoding, String separatorChars)
+            throws UnsupportedEncodingException {
+        if (bline == null) {
+            return null;
+        }
+
+        byte[] sep;
+        CharsetDecoder decoder = null;
+        if (encoding != null) {
+            sep = separatorChars.getBytes(encoding);
+
+            decoder = Charset.forName(encoding).newDecoder();
+            decoder.onMalformedInput(CodingErrorAction.REPORT);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+        } else {
+            sep = separatorChars.getBytes();
+        }
+
+        if (sep.length == 0) {
+            // nothing to split on: the whole line is the only column.
+            // new String(byte[], String) rejects a null charset name, so use the platform default in that case
+            return new String[] { encoding != null ? new String(bline, encoding) : new String(bline) };
+        }
+
+        CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$
+        utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
+        utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+
+        List<String> substrings = new ArrayList<String>();
 	
+        int fieldStart = 0;
+        int cursor = 0;
+        // last index at which a complete separator can still start
+        int lastCandidate = bline.length - sep.length;
+        while (cursor <= lastCandidate) {
+            int matched = 0;
+            while (matched < sep.length && bline[cursor + matched] == sep[matched]) {
+                matched++;
+            }
+            if (matched == sep.length) {
+                // we found a new field
+                substrings.add(decodeSplitField(decoder, utf8Decoder, encoding, bline, fieldStart, cursor));
+                cursor += sep.length;
+                fieldStart = cursor;
+            } else {
+                // a partial match is ordinary field content: only move one byte forward so nothing is dropped
+                cursor++;
+            }
+        }
+        // whatever follows the last separator (possibly nothing) is the last field
+        substrings.add(decodeSplitField(decoder, utf8Decoder, encoding, bline, fieldStart, bline.length));
+
+        return substrings.toArray(new String[substrings.size()]);
+    }
+
+    /**
+     * Decodes the bytes of {@code line} between {@code from} (inclusive) and {@code to} (exclusive) as one field.
+     */
+    private static String decodeSplitField(CharsetDecoder decoder, CharsetDecoder utf8Decoder, String encoding,
+            byte[] line, int from, int to) {
+        int fieldLength = to - from;
+        if (fieldLength == 0) {
+            // empty field
+            return ""; //$NON-NLS-1$
+        }
+        // newStringFromSplit reads its input from index 0, so the slice is copied out first
+        byte[] fieldBytes = new byte[fieldLength];
+        System.arraycopy(line, from, fieldBytes, 0, fieldLength);
+        return newStringFromSplit(decoder, utf8Decoder, encoding, fieldBytes, fieldLength);
+    }
+
 	/**
 	 * replace the method : String.split(String regex)
+     * 
 	 * @param str
 	 * @param separatorChars
 	 * @return
 	 */
-	public static String[] splitNotRegex(String str,String separatorChars) {
+    public static String[] splitNotRegex(String str, String separatorChars) {
 		if (str == null) {
            return null;
        }
@@ -63,7 +167,7 @@ public class StringUtils {
        }

        int resultSize = substrings.size();
-        while (resultSize > 0 && substrings.get(resultSize-1).equals("")) {
+        while (resultSize > 0 && substrings.get(resultSize - 1).equals("")) {
        	resultSize--;
        }
        String[] result = new String[resultSize];
@@ -236,7 +340,7 @@ public class StringUtils {
            } else {
                // case 3:
                if (replacement == null) {
-                    if ( (caseSensitive && src.equals(search)) || (!caseSensitive && src.equalsIgnoreCase(search))) {
+                    if ((caseSensitive && src.equals(search)) || (!caseSensitive && src.equalsIgnoreCase(search))) {
                        // regex != null && src != null && replacement != null, and match the whole src
                        return replacement;
                    } else {
@@ -305,8 +409,8 @@ public class StringUtils {
    
    
    /**
-     * return null value not "null" String when obj is null 
-     * that is the only difference with String.valueOf(Object obj)
+     * return null value not "null" String when obj is null that is the only difference with String.valueOf(Object obj)
+     * 
     * @param obj
     * @return
     */