1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.orangesignal.csv;
18
19 import java.io.BufferedReader;
20 import java.io.Closeable;
21 import java.io.IOException;
22 import java.io.InputStreamReader;
23 import java.io.Reader;
24 import java.nio.charset.Charset;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.regex.Pattern;
28
29
30
31
32
33
34
35 public class CsvReader implements Closeable {
36
37
38
39
40 private Reader in;
41
42
43
44
45 private CsvConfig cfg;
46
47
48
49
50 private final StringBuilder line = new StringBuilder();
51
52
53
54
55 private int nextChar = -1;
56
57
58
59
60 private int pos;
61
62
63
64
65 private boolean skiped;
66
67
68
69
70 private int startTokenLineNumber = 0;
71
72
73
74
75 private int endTokenLineNumber = 0;
76
77
78
79
80 private int startLineNumber = 0;
81
82
83
84
85 private int endLineNumber = 0;
86
87
88
89
90 private int lineNumber = 0;
91
92
93
94
95 private boolean endOfFile;
96
97
98
99
100 private boolean endOfLine;
101
102
103
104
105 private boolean cr = false;
106
107
108
109
110 private final boolean utf8bom;
111
112
113
114
115 private int countNumberOfColumns = -1;
116
117
118
119
120 private static final char CR = '\r';
121
122
123
124
125 private static final char LF = '\n';
126
127
128
129
130 private static final int BOM = 0xFEFF;
131
132 private static final int DEFAULT_CHAR_BUFFER_SIZE = 8192;
133
134
135
136
137
138
139
140
141
142
143
144
145
146 public CsvReader(final Reader in, final int sz, final CsvConfig cfg) {
147 if (cfg == null) {
148 throw new IllegalArgumentException("CsvConfig must not be null");
149 }
150 cfg.validate();
151 this.in = new BufferedReader(in, sz);
152 this.cfg = cfg;
153 final String s;
154 if (in instanceof InputStreamReader) {
155 s = ((InputStreamReader) in).getEncoding();
156 } else {
157 s = Charset.defaultCharset().name();
158 }
159 this.utf8bom = s.toLowerCase().matches("^utf\\-{0,1}8$");
160 }
161
162
163
164
165
166
167
168
169
170 public CsvReader(final Reader in, final CsvConfig cfg) {
171 this(in, DEFAULT_CHAR_BUFFER_SIZE, cfg);
172 }
173
174
175
176
177
178
179
180
181 public CsvReader(final Reader in, final int sz) {
182 this(in, sz, new CsvConfig());
183 }
184
185
186
187
188
189
190 public CsvReader(final Reader in) {
191 this(in, DEFAULT_CHAR_BUFFER_SIZE, new CsvConfig());
192 }
193
194
195
196
197
198
199
200
201 public int getStartLineNumber() { return startLineNumber; }
202
203
204
205
206
207
208 public int getEndLineNumber() { return endLineNumber; }
209
210
211
212
213
214
215 public int getLineNumber() { return lineNumber; }
216
217
218
219
220
221
222
223 public boolean isEndOfFile() { return endOfFile; }
224
225
226
227
228 private void ensureOpen() throws IOException {
229 if (in == null) {
230 throw new IOException("Reader closed");
231 }
232 }
233
234
235
236
237
238
239
240 private int cacheLine() throws IOException {
241
242 line.setLength(0);
243 int c;
244 if (nextChar != -1) {
245 c = nextChar;
246 nextChar = -1;
247 } else {
248 c = in.read();
249
250 if (lineNumber == 0 && utf8bom && c == BOM) {
251 c = in.read();
252 }
253 }
254
255 int result = -1;
256 while (c != -1) {
257 line.append((char) c);
258 if (c == CR) {
259 result = line.length();
260 nextChar = in.read();
261 if (nextChar == LF) {
262 line.append((char) nextChar);
263 nextChar = -1;
264 }
265 break;
266 } else if (c == LF) {
267 result = line.length();
268 break;
269 }
270 c = in.read();
271 }
272 pos = 0;
273
274 return result;
275 }
276
277
278
279
280
281
282
283 private int read() throws IOException {
284 synchronized (this) {
285 ensureOpen();
286 if (endOfFile) {
287 return -1;
288 }
289 if (line.length() == 0 || line.length() <= pos) {
290 cacheLine();
291 }
292 if (line.length() == 0) {
293 return -1;
294 }
295 return line.charAt(pos++);
296 }
297 }
298
299
300
301
302
303
304
305
306
307 public List<String> readValues() throws IOException {
308 final List<CsvToken> tokens = readTokens();
309 if (tokens == null) {
310 return null;
311 }
312 final List<String> results = new ArrayList<String>(tokens.size());
313 for (final CsvToken token : tokens) {
314 results.add(token.getValue());
315 }
316 return results;
317 }
318
319
320
321
322
323
324
325
326 public List<CsvToken> readTokens() throws IOException {
327 synchronized (this) {
328 ensureOpen();
329 if (endOfFile) {
330 return null;
331 }
332 if (!skiped) {
333 for (int i = 0; i < cfg.getSkipLines(); i++) {
334 cacheLine();
335 endTokenLineNumber++;
336 lineNumber++;
337 }
338 line.setLength(0);
339 skiped = true;
340 }
341 return readCsvTokens();
342 }
343 }
344
345 private int arraySize = 3;
346
347
348
349
350
351
352
353
354 private List<CsvToken> readCsvTokens() throws IOException {
355 final List<CsvToken> results = new ArrayList<CsvToken>(arraySize);
356 endTokenLineNumber++;
357 startLineNumber = endTokenLineNumber;
358 endOfLine = false;
359 do {
360 if (line.length() == 0 || line.length() <= pos) {
361 int breakLine = cacheLine();
362
363
364 if (cfg.isIgnoreEmptyLines()) {
365 boolean ignore = true;
366 while (ignore && line.length() > 0) {
367 ignore = false;
368 if (isWhitespaces(breakLine == -1 ? line : line.substring(0, breakLine - 1))) {
369 ignore = true;
370 endTokenLineNumber++;
371 startLineNumber = endTokenLineNumber;
372 lineNumber++;
373 breakLine = cacheLine();
374 }
375 }
376 }
377
378
379 if (cfg.getIgnoreLinePatterns() != null) {
380 boolean ignore = true;
381 while (ignore && line.length() > 0) {
382 ignore = false;
383 for (final Pattern p : cfg.getIgnoreLinePatterns()) {
384 if (p != null && p.matcher(breakLine == -1 ? line : line.substring(0, breakLine - 1)).matches()) {
385 ignore = true;
386 endTokenLineNumber++;
387 startLineNumber = endTokenLineNumber;
388 lineNumber++;
389 breakLine = cacheLine();
390 break;
391 }
392 }
393 }
394 }
395 }
396 startTokenLineNumber = endTokenLineNumber;
397 results.add(readCsvToken());
398 } while (!endOfLine);
399 endLineNumber = endTokenLineNumber;
400 lineNumber++;
401
402 arraySize = results.size();
403
404
405 if (arraySize == 1) {
406 if (endOfFile) {
407 final String value = results.get(0).getValue();
408 if (cfg.isIgnoreEmptyLines() && isWhitespaces(value)) {
409 return null;
410 }
411 if (cfg.getIgnoreLinePatterns() != null) {
412 for (final Pattern p : cfg.getIgnoreLinePatterns()) {
413 if (p != null && p.matcher(value).matches()) {
414 return null;
415 }
416 }
417 }
418 } else {
419 if (cfg.isIgnoreEmptyLines() && (line.length() == 0 || isWhitespaces(line))) {
420 return null;
421 }
422 }
423 }
424 if (!cfg.isVariableColumns()) {
425 if (countNumberOfColumns >= 0 && countNumberOfColumns != arraySize) {
426 throw new CsvTokenException(String.format("Invalid column count in CSV input on line %d.", startLineNumber), results);
427 }
428 countNumberOfColumns = arraySize;
429 }
430
431 return results;
432 }
433
434 private final StringBuilder buf = new StringBuilder();
435 private boolean inQuote = false;
436 private boolean enclosed = false;
437 private boolean escaped = false;
438 private boolean _escaped = false;
439
440
441
442
443
444
445
446 private CsvToken readCsvToken() throws IOException {
447 buf.setLength(0);
448
449 inQuote = false;
450 enclosed = false;
451 escaped = false;
452 _escaped = false;
453
454 endTokenLineNumber = startTokenLineNumber;
455
456 while (true) {
457 final int c = read();
458 if (cr) {
459 cr = false;
460 escaped = false;
461 if (c == LF) {
462 if (inQuote) {
463 buf.append((char) c);
464 }
465 continue;
466 }
467 } else if (_escaped && c == cfg.getSeparator()) {
468 buf.append((char) c);
469 _escaped = false;
470 continue;
471 }
472 _escaped = false;
473 if (c == -1) {
474 endOfLine = true;
475
476
477
478 endOfFile = true;
479 break;
480 }
481
482
483 if (!inQuote) {
484
485 if (c == cfg.getSeparator()) {
486 break;
487
488 } else if (c == CR) {
489 endOfLine = true;
490 cr = true;
491 break;
492
493 } else if (c == LF) {
494 endOfLine = true;
495 break;
496
497 } else if (!cfg.isQuoteDisabled() && !enclosed && c == cfg.getQuote()) {
498 if (isWhitespaces(buf)) {
499 inQuote = true;
500 }
501
502 } else if (cfg.isQuoteDisabled() && !cfg.isEscapeDisabled() && c == cfg.getEscape()) {
503 _escaped = true;
504 }
505
506 } else {
507
508 if (!cfg.isEscapeDisabled() && cfg.getQuote() == cfg.getEscape()) {
509
510 if (escaped) {
511
512 if (c == cfg.getSeparator()) {
513 break;
514 } else if (c == CR) {
515 endOfLine = true;
516 cr = true;
517 break;
518 } else if (c == LF) {
519 endOfLine = true;
520 break;
521 } else if (c == cfg.getEscape()) {
522 escaped = false;
523 buf.append((char) c);
524 continue;
525 }
526
527 } else if (c == cfg.getEscape()) {
528 escaped = true;
529 buf.append((char) c);
530 continue;
531 }
532 }
533
534
535 if (c == cfg.getQuote()) {
536 if (escaped) {
537
538 escaped = false;
539 } else {
540 inQuote = false;
541 enclosed = true;
542 }
543
544 } else if (c == CR) {
545 cr = true;
546 endTokenLineNumber++;
547
548 } else if (c == LF) {
549 endTokenLineNumber++;
550 }
551
552 if (!cfg.isEscapeDisabled() && c == cfg.getEscape()) {
553 escaped = true;
554 } else {
555 escaped = false;
556 }
557 }
558
559 buf.append((char) c);
560 }
561
562 if (escaped) {
563 enclosed = true;
564 }
565
566 String value = buf.toString();
567
568
569 if (enclosed) {
570
571 final int i = value.lastIndexOf(cfg.getQuote()) + 1;
572 assert i > 0;
573 if (i < value.length() && !isWhitespaces(value.substring(i + 1))) {
574 enclosed = false;
575 }
576 }
577
578 if (cfg.isIgnoreLeadingWhitespaces() || enclosed) {
579 value = removeLeadingWhitespaces(value);
580 }
581 if (cfg.isIgnoreTrailingWhitespaces() || enclosed) {
582 value = removeTrailingWhitespaces(value);
583 }
584 if (enclosed) {
585
586 value = value.substring(1, value.length() - 1);
587
588 if (cfg.getBreakString() != null) {
589 value = value.replaceAll("\r\n|\r|\n", cfg.getBreakString());
590 }
591
592 if (!cfg.isEscapeDisabled()) {
593 value = unescapeQuote(value);
594 }
595 } else {
596 if (cfg.getNullString() != null) {
597 if (cfg.isIgnoreCaseNullString()) {
598 if (cfg.getNullString().equalsIgnoreCase(value)) {
599 value = null;
600 }
601 } else {
602 if (cfg.getNullString().equals(value)) {
603 value = null;
604 }
605 }
606 }
607 if (value != null && !cfg.isEscapeDisabled()) {
608 value = unescapeSeparator(value);
609 }
610 }
611
612 return new SimpleCsvToken(value, startTokenLineNumber, endTokenLineNumber, enclosed);
613 }
614
615
616
617
618
619
620
621 private String unescapeQuote(final String value) {
622 return value.replace(
623 new StringBuilder(2).append(cfg.getEscape()).append(cfg.getQuote()),
624 new StringBuilder(1).append(cfg.getQuote())
625 );
626 }
627
628 private String unescapeSeparator(final String value) {
629 return value.replace(
630 new StringBuilder(2).append(cfg.getEscape()).append(cfg.getSeparator()),
631 new StringBuilder(1).append(cfg.getSeparator())
632 );
633 }
634
635
636
637 @Override
638 public void close() throws IOException {
639 synchronized (this) {
640 in.close();
641 in = null;
642 cfg = null;
643 line.setLength(0);
644 }
645 }
646
647
648
649
650
651
652
653
654
655 private static boolean isWhitespaces(final CharSequence value) {
656 final int len = value.length();
657 for (int i = 0; i < len; i++) {
658 if (!Character.isWhitespace(value.charAt(i))) {
659 return false;
660 }
661 }
662 return true;
663 }
664
665 private static String removeLeadingWhitespaces(final String value) {
666 final int len = value.length();
667 int pos = -1;
668 for (int i = 0; i < len; i++) {
669 if (!Character.isWhitespace(value.charAt(i))) {
670 pos = i;
671 break;
672 }
673 }
674 if (pos == -1) {
675 return "";
676 }
677 if (pos > 0) {
678 return value.substring(pos);
679 }
680 return value;
681 }
682
683 private static String removeTrailingWhitespaces(final String value) {
684 final int start = value.length() - 1;
685 int pos = -1;
686 for (int i = start; i >= 0; i--) {
687 if (!Character.isWhitespace(value.charAt(i))) {
688 pos = i;
689 break;
690 }
691 }
692 if (pos == -1) {
693 return "";
694 }
695 if (pos != start) {
696 return value.substring(0, pos + 1);
697 }
698 return value;
699 }
700
701 }