いまだにCSVその2
2010/09/18
csv
java
Javaです。CSVを処理します。
とりあえず、JUnit4(使ったことがなかったので)でテスト用意して、自分のニーズに対応しているだろうことを確認。
これでCSVから離れられる....
テスト用データ、ソースは全部、コミット。
CSVUtils.java - quicklunch - Project Hosting on Google Code
package quicklunch.e2.goodies.utils;
import static org.junit.Assert.*;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.junit.Test;
/**
 * JUnit 4 tests for {@link CSVUtils#parse}. Fixture files are loaded from
 * {@code doc-files/csv/} relative to this class; inline strings are used for
 * the small cases.
 */
public class CSVUtilsTest {

    /**
     * Opens a CSV fixture as a class-path resource.
     *
     * @param filename
     *            file name below {@code doc-files/csv/}
     * @return the resource stream, or {@code null} when the resource is missing
     */
    static InputStream getStream(String filename) {
        return CSVUtilsTest.class.getResourceAsStream("doc-files/csv/" + filename);
    }

    /**
     * Parses a fixture with the default encoding and always closes the
     * stream afterwards. A missing fixture fails the test with a readable
     * message instead of a NullPointerException inside the reader.
     */
    private static long parseFixture(String filename, CSVUtils.IExecutor executor)
            throws IOException {
        InputStream in = getStream(filename);
        assertNotNull("missing test resource: doc-files/csv/" + filename, in);
        try {
            return CSVUtils.parse(in, executor);
        } finally {
            in.close();
        }
    }

    /** Same as {@link #parseFixture(String, CSVUtils.IExecutor)} with an explicit charset. */
    private static long parseFixture(String filename, CSVUtils.IExecutor executor,
            String charsetName) throws IOException {
        InputStream in = getStream(filename);
        assertNotNull("missing test resource: doc-files/csv/" + filename, in);
        try {
            return CSVUtils.parse(in, executor, charsetName);
        } finally {
            in.close();
        }
    }

    /** Asserts the field count first, then every field value in order. */
    private static void assertFields(List<String> line, String... expected) {
        assertEquals(expected.length, line.size());
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], line.get(i));
        }
    }

    /** Single-line files, empty fields, and preservation of surrounding spaces. */
    @Test
    public void testParse001() throws IOException {
        parseFixture("001.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                assertEquals(0, row);
                assertEquals(3, line.size());
            }
        });
        parseFixture("002.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                assertEquals(0, row);
                // leading, middle and trailing fields are empty
                assertFields(line, "", "bbb", "", "ccc", "");
            }
        });
        parseFixture("003.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                if (0 == row) {
                    assertFields(line, "aaa", "bbb", "ccc");
                }
                if (1 == row) {
                    // spaces around values must survive untouched
                    assertFields(line, "000", " 111 ", " 222", "333 ");
                }
            }
        });
    }

    /** Empty lines, whitespace-only fields and lines made only of separators. */
    @Test
    public void testParse002() throws IOException {
        parseFixture("004.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                if (row == 0) {
                    // empty line
                    assertFields(line);
                }
                if (row == 1) {
                    // a single space
                    assertFields(line, " ");
                }
                if (row == 2) {
                    // two empty fields
                    assertFields(line, "", "");
                }
                if (row == 3) {
                    // a space followed by an empty field
                    assertFields(line, " ", "");
                }
                if (row == 4) {
                    // empty line
                    assertFields(line);
                }
                if (row == 5) {
                    // a single space
                    assertFields(line, " ");
                }
            }
        });
    }

    /** String input: non-ASCII text, empty input and a trailing line break. */
    @Test
    public void testParse003() throws IOException {
        {
            long n = CSVUtils.parse("日本語", new CSVUtils.AbstractExecutor() {
                @Override
                public void exec(long row, List<String> line) {
                    assertEquals(0, row);
                    assertFields(line, "日本語");
                }
            });
            assertEquals(1, n);
        }
        {
            long n = CSVUtils.parse("", new CSVUtils.AbstractExecutor() {
                @Override
                public void exec(long row, List<String> line) {
                    assertEquals(0, row);
                    assertEquals(0, line.size());
                }
            });
            assertEquals(0, n);
        }
        {
            // the trailing line break must not produce an extra row
            long n = CSVUtils.parse("日本語\n", new CSVUtils.AbstractExecutor() {
                @Override
                public void exec(long row, List<String> line) {
                    if (row == 0) {
                        assertFields(line, "日本語");
                    }
                }
            });
            assertEquals(1, n);
        }
    }

    /** UTF-8 file with CRLF line ends, including line breaks inside fields. */
    @Test
    public void testParse004() throws IOException {
        long n = parseFixture("005_utf8_crlf.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                if (0 == row) {
                    // leading / trailing spaces are kept
                    assertFields(line, "日本語1", "日本語2", "日本語3", " 日本語4", "日本語5 ");
                }
                if (1 == row) {
                    // embedded CRLF in the first field, empty fourth field
                    assertFields(line, "dd\r\neeee", "fff", " gg ", "", "hhh");
                }
                if (2 == row) {
                    // embedded CRLF in the second field, empty last field
                    assertFields(line, "aaa", "bbb\r\nッッ", "ccc", "dddd", "");
                }
            }
        }, "utf8");
        assertEquals(3, n);
    }
}
import static org.junit.Assert.*;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.junit.Test;
/**
 * JUnit 4 tests for {@link CSVUtils#parse}. Fixture files are loaded from
 * {@code doc-files/csv/} relative to this class; inline strings are used for
 * the small cases.
 */
public class CSVUtilsTest {

    /**
     * Opens a CSV fixture as a class-path resource.
     *
     * @param filename
     *            file name below {@code doc-files/csv/}
     * @return the resource stream, or {@code null} when the resource is missing
     */
    static InputStream getStream(String filename) {
        return CSVUtilsTest.class.getResourceAsStream("doc-files/csv/" + filename);
    }

    /**
     * Parses a fixture with the default encoding and always closes the
     * stream afterwards. A missing fixture fails the test with a readable
     * message instead of a NullPointerException inside the reader.
     */
    private static long parseFixture(String filename, CSVUtils.IExecutor executor)
            throws IOException {
        InputStream in = getStream(filename);
        assertNotNull("missing test resource: doc-files/csv/" + filename, in);
        try {
            return CSVUtils.parse(in, executor);
        } finally {
            in.close();
        }
    }

    /** Same as {@link #parseFixture(String, CSVUtils.IExecutor)} with an explicit charset. */
    private static long parseFixture(String filename, CSVUtils.IExecutor executor,
            String charsetName) throws IOException {
        InputStream in = getStream(filename);
        assertNotNull("missing test resource: doc-files/csv/" + filename, in);
        try {
            return CSVUtils.parse(in, executor, charsetName);
        } finally {
            in.close();
        }
    }

    /** Asserts the field count first, then every field value in order. */
    private static void assertFields(List<String> line, String... expected) {
        assertEquals(expected.length, line.size());
        for (int i = 0; i < expected.length; i++) {
            assertEquals(expected[i], line.get(i));
        }
    }

    /** Single-line files, empty fields, and preservation of surrounding spaces. */
    @Test
    public void testParse001() throws IOException {
        parseFixture("001.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                assertEquals(0, row);
                assertEquals(3, line.size());
            }
        });
        parseFixture("002.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                assertEquals(0, row);
                // leading, middle and trailing fields are empty
                assertFields(line, "", "bbb", "", "ccc", "");
            }
        });
        parseFixture("003.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                if (0 == row) {
                    assertFields(line, "aaa", "bbb", "ccc");
                }
                if (1 == row) {
                    // spaces around values must survive untouched
                    assertFields(line, "000", " 111 ", " 222", "333 ");
                }
            }
        });
    }

    /** Empty lines, whitespace-only fields and lines made only of separators. */
    @Test
    public void testParse002() throws IOException {
        parseFixture("004.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                if (row == 0) {
                    // empty line
                    assertFields(line);
                }
                if (row == 1) {
                    // a single space
                    assertFields(line, " ");
                }
                if (row == 2) {
                    // two empty fields
                    assertFields(line, "", "");
                }
                if (row == 3) {
                    // a space followed by an empty field
                    assertFields(line, " ", "");
                }
                if (row == 4) {
                    // empty line
                    assertFields(line);
                }
                if (row == 5) {
                    // a single space
                    assertFields(line, " ");
                }
            }
        });
    }

    /** String input: non-ASCII text, empty input and a trailing line break. */
    @Test
    public void testParse003() throws IOException {
        {
            long n = CSVUtils.parse("日本語", new CSVUtils.AbstractExecutor() {
                @Override
                public void exec(long row, List<String> line) {
                    assertEquals(0, row);
                    assertFields(line, "日本語");
                }
            });
            assertEquals(1, n);
        }
        {
            long n = CSVUtils.parse("", new CSVUtils.AbstractExecutor() {
                @Override
                public void exec(long row, List<String> line) {
                    assertEquals(0, row);
                    assertEquals(0, line.size());
                }
            });
            assertEquals(0, n);
        }
        {
            // the trailing line break must not produce an extra row
            long n = CSVUtils.parse("日本語\n", new CSVUtils.AbstractExecutor() {
                @Override
                public void exec(long row, List<String> line) {
                    if (row == 0) {
                        assertFields(line, "日本語");
                    }
                }
            });
            assertEquals(1, n);
        }
    }

    /** UTF-8 file with CRLF line ends, including line breaks inside fields. */
    @Test
    public void testParse004() throws IOException {
        long n = parseFixture("005_utf8_crlf.csv", new CSVUtils.AbstractExecutor() {
            @Override
            public void exec(long row, List<String> line) {
                if (0 == row) {
                    // leading / trailing spaces are kept
                    assertFields(line, "日本語1", "日本語2", "日本語3", " 日本語4", "日本語5 ");
                }
                if (1 == row) {
                    // embedded CRLF in the first field, empty fourth field
                    assertFields(line, "dd\r\neeee", "fff", " gg ", "", "hhh");
                }
                if (2 == row) {
                    // embedded CRLF in the second field, empty last field
                    assertFields(line, "aaa", "bbb\r\nッッ", "ccc", "dddd", "");
                }
            }
        }, "utf8");
        assertEquals(3, n);
    }
}
package quicklunch.e2.goodies.utils;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * Small streaming CSV reader. Input is split into tokens by
 * {@link CSVTokenizer} and handed to an {@link IExecutor} one line at a time.
 * The class is declared abstract and only offers static entry points.
 *
 * @author nakawakashigeto
 */
public abstract class CSVUtils {

    /** Callback interface driven by the {@code parse} methods. */
    public interface IExecutor {
        /** Called once before any input is read. */
        public void pre();

        /**
         * Called once for every parsed line, including empty lines.
         *
         * @param row
         *            zero-based line index
         * @param line
         *            field values of the line; empty for an empty line
         */
        public void exec(long row, List<String> line);

        /** Called once after parsing, also when parsing failed with an exception. */
        public void post();
    }

    /** {@link IExecutor} with no-op methods so callers override only what they need. */
    abstract static public class AbstractExecutor implements IExecutor {
        public void pre() {
        }

        public void exec(long row, List<String> line) {
        }

        public void post() {
        }
    }// end

    // ===================

    /** Token types produced by {@link CSVTokenizer}. */
    public enum TT {
        EOF("EOF"), FIELD("FIELD"), COMMA("COMMA"), CRLF("CRLF"), CR("CR"), LF("LF");

        String s;

        TT(String s) {
            this.s = s;
        }

        @Override
        public String toString() {
            return s;
        }
    }

    /**
     * One lexical token: its type and the text collected for it.
     *
     * @author nakawakashigeto
     */
    public static class Token {
        TT type;
        public StringBuilder val = new StringBuilder();

        /** Sets the type and returns this token so it can be returned in one statement. */
        public Token build(TT type) {
            this.type = type;
            return this;
        }

        /** Appends a single character given as the int returned by {@code Reader.read()}. */
        public void append(int ch) {
            this.val.append((char) ch);
        }

        public void append(String s) {
            this.val.append(s);
        }

        @Override
        public String toString() {
            return "T:[" + type + "] V:[" + val + "]";
        }
    }

    /**
     * Splits character input into CSV tokens. Fields may be unquoted, wrapped
     * in double quotes, or wrapped in single quotes; CRLF, CR and LF are all
     * recognised as line breaks.
     *
     * @author nakawakashigeto
     */
    public static class CSVTokenizer {
        PushbackReader reader;
        static final int DQUOTE = '"';
        static final int QUOTE = '\'';
        static final int COMMA = ',';
        static final int EOF = -1;
        static final int CR = '\r';
        static final int LF = '\n';
        /* STATE */
        static final int ST_nonescaped = 1;
        static final int ST_escaped = 2;
        static final int ST_escaped_single_quote = 3;

        /** Tokenizes the characters of {@code s} directly. */
        public CSVTokenizer(String s) {
            this.reader = new PushbackReader(new BufferedReader(
                    new StringReader(s)));
        }

        /** Tokenizes a byte stream decoded with the platform default charset. */
        public CSVTokenizer(InputStream inputStream) {
            this.reader = new PushbackReader(new BufferedReader(
                    new InputStreamReader(inputStream)));
        }

        /**
         * Tokenizes a byte stream decoded with the named charset.
         *
         * @throws UnsupportedEncodingException
         *             when the charset name is not supported
         */
        public CSVTokenizer(InputStream inputStream, String charasetname)
                throws UnsupportedEncodingException {
            this.reader = new PushbackReader(new BufferedReader(
                    new InputStreamReader(inputStream, charasetname)));
        }

        /**
         * Reads the next token.
         * <ul>
         * <li>end of input gives an {@link TT#EOF} token;</li>
         * <li>a comma gives a {@link TT#COMMA} token whose value is ",";</li>
         * <li>"\r\n", "\r" and "\n" give {@link TT#CRLF}, {@link TT#CR} and
         * {@link TT#LF};</li>
         * <li>anything else gives a {@link TT#FIELD}. An unquoted field ends
         * before a comma, line break, double quote or end of input. A quoted
         * field keeps commas and line breaks verbatim, turns a doubled quote
         * into one literal quote, and also ends at end of input when the
         * closing quote is missing.</li>
         * </ul>
         *
         * @return the next token, never {@code null}
         * @throws IOException
         *             when the underlying reader fails
         */
        public Token token() throws IOException {
            int state = 0;
            Token token = new Token();
            loop: while (true) {
                int ch = read();
                switch (state) {
                case 0:
                    /*
                     * -- START: the first character decides the token kind --
                     */
                    if (ch == EOF) {
                        return token.build(TT.EOF);
                    }
                    if (ch == DQUOTE) {
                        // opening double quote; the quote itself is not stored
                        state = ST_escaped;
                        token.type = TT.FIELD;
                        break;
                    }
                    if (ch == QUOTE) {
                        // opening single quote; the quote itself is not stored
                        state = ST_escaped_single_quote;
                        token.type = TT.FIELD;
                        break;
                    }
                    if (ch == COMMA) {
                        token.append(ch);
                        return token.build(TT.COMMA);
                    }
                    if (ch == CR) {
                        ch = read();
                        if (ch == LF) {
                            return token.build(TT.CRLF);
                        }
                        // lone CR: give back the look-ahead character
                        unread(ch);
                        return token.build(TT.CR);
                    }
                    if (ch == LF) {
                        return token.build(TT.LF);
                    }
                    state = ST_nonescaped;
                    token.type = TT.FIELD;
                    // intentional fall through: ch is the first field character
                case ST_nonescaped:
                    /*
                     * -- unquoted field --
                     */
                    if (ch == EOF || !isTextdata(ch)) {
                        // the terminator belongs to the next token
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                case ST_escaped:
                    /*
                     * -- field in double quotes --
                     */
                    if (ch == EOF) {
                        // unterminated quote: return what was collected
                        return token.build(TT.FIELD);
                    }
                    if (ch == DQUOTE) {
                        ch = read();
                        if (ch == DQUOTE) {
                            // doubled quote is an escaped literal quote
                            token.append(DQUOTE);
                            break;
                        }
                        // closing quote: give back the look-ahead character
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                case ST_escaped_single_quote:
                    /*
                     * -- field in single quotes --
                     */
                    if (ch == EOF) {
                        // unterminated quote: return what was collected
                        return token.build(TT.FIELD);
                    }
                    if (ch == QUOTE) {
                        ch = read();
                        if (ch == QUOTE) {
                            // doubled quote is an escaped literal quote
                            token.append(QUOTE);
                            break;
                        }
                        // closing quote: give back the look-ahead character
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                default:
                    break loop;
                }
            }
            return token;
        }

        /** True unless {@code ch} is CR, LF, a double quote or a comma. */
        boolean isTextdata(int ch) {
            return notEq(ch, CR) && notEq(ch, LF) && notEq(ch, DQUOTE)
                    && notEq(ch, COMMA);
        }

        /** Reads one character; returns -1 at end of input or when no reader is set. */
        int read() throws IOException {
            if (reader != null) {
                return reader.read();
            }
            return -1;
        }

        boolean notEq(int l, int r) {
            return (l != r);
        }

        /** Pushes one character back; the end-of-input marker -1 is never pushed back. */
        void unread(int ch) throws IOException {
            if (reader != null && ch != -1) {
                reader.unread(ch);
            }
        }

        /** Closes the underlying reader, ignoring any failure. */
        public void close() {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best-effort close: nothing useful can be done here
                }
            }
        }
    } // end

    /**
     * Parses a byte stream decoded with the named charset.
     * {@code executor.pre()} runs first and {@code executor.post()} always
     * runs last, also when the charset is unsupported or parsing fails.
     * The stream is not closed by this method.
     *
     * @param inputStream
     *            CSV bytes
     * @param executor
     *            receives every parsed line
     * @param charasetname
     *            charset name used to decode the stream
     * @return number of lines passed to {@code executor.exec}
     * @throws IOException
     *             when reading fails or the charset is not supported
     */
    public static long parse(InputStream inputStream, IExecutor executor,
            String charasetname) throws IOException {
        executor.pre();
        try {
            return parseLines(new CSVTokenizer(inputStream, charasetname),
                    executor);
        } finally {
            executor.post();
        }
    }

    /**
     * Parses a byte stream decoded with the platform default charset.
     * The charset comes from {@code Charset.defaultCharset()} rather than the
     * {@code file.encoding} system property, because the property can be
     * unset, overwritten at runtime, or hold a value that is not a charset
     * name.
     *
     * @return number of lines passed to {@code executor.exec}
     * @throws IOException
     *             when reading fails
     */
    public static long parse(InputStream inputStream, IExecutor executor)
            throws IOException {
        return parse(inputStream, executor,
                java.nio.charset.Charset.defaultCharset().name());
    }

    /**
     * Parses CSV text held in a string. The characters are read directly,
     * without an intermediate byte encoding, so every character reaches the
     * executor unchanged.
     *
     * @return number of lines passed to {@code executor.exec}
     * @throws NullPointerException
     *             when {@code s} is {@code null}
     * @throws IOException
     *             when reading fails
     */
    public static long parse(String s, IExecutor executor) throws IOException {
        if (s == null) {
            throw new NullPointerException("s");
        }
        executor.pre();
        try {
            return parseLines(new CSVTokenizer(s), executor);
        } finally {
            executor.post();
        }
    }

    /** Reads tokens line by line and reports each line; returns the line count. */
    private static long parseLines(CSVTokenizer tokenizer, IExecutor executor)
            throws IOException {
        long row = 0;
        Token token;
        do {
            List<String> line = new ArrayList<String>();
            // previous token on the current line, null at line start
            Token preToken = null;
            while (!endsLine(token = tokenizer.token())) {
                if (token.type == TT.COMMA) {
                    // a comma at line start or right after another comma
                    // means the field before it was empty
                    if (preToken == null || preToken.type == TT.COMMA) {
                        line.add("");
                    }
                } else {
                    line.add(token.val.toString());
                }
                preToken = token;
            }
            // end of input right at a line start: no further line to report
            if (preToken == null && token.type == TT.EOF) {
                break;
            }
            // a trailing comma means the last field was empty
            if (preToken != null && preToken.type == TT.COMMA) {
                line.add("");
            }
            executor.exec(row++, line);
        } while (token.type != TT.EOF);
        return row;
    }

    /** True for the tokens that terminate a line: EOF, CRLF, CR and LF. */
    private static boolean endsLine(Token token) {
        return token.type == TT.EOF || token.type == TT.CRLF
                || token.type == TT.CR || token.type == TT.LF;
    }
}
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * Small streaming CSV reader. Input is split into tokens by
 * {@link CSVTokenizer} and handed to an {@link IExecutor} one line at a time.
 * The class is declared abstract and only offers static entry points.
 *
 * @author nakawakashigeto
 */
public abstract class CSVUtils {

    /** Callback interface driven by the {@code parse} methods. */
    public interface IExecutor {
        /** Called once before any input is read. */
        public void pre();

        /**
         * Called once for every parsed line, including empty lines.
         *
         * @param row
         *            zero-based line index
         * @param line
         *            field values of the line; empty for an empty line
         */
        public void exec(long row, List<String> line);

        /** Called once after parsing, also when parsing failed with an exception. */
        public void post();
    }

    /** {@link IExecutor} with no-op methods so callers override only what they need. */
    abstract static public class AbstractExecutor implements IExecutor {
        public void pre() {
        }

        public void exec(long row, List<String> line) {
        }

        public void post() {
        }
    }// end

    // ===================

    /** Token types produced by {@link CSVTokenizer}. */
    public enum TT {
        EOF("EOF"), FIELD("FIELD"), COMMA("COMMA"), CRLF("CRLF"), CR("CR"), LF("LF");

        String s;

        TT(String s) {
            this.s = s;
        }

        @Override
        public String toString() {
            return s;
        }
    }

    /**
     * One lexical token: its type and the text collected for it.
     *
     * @author nakawakashigeto
     */
    public static class Token {
        TT type;
        public StringBuilder val = new StringBuilder();

        /** Sets the type and returns this token so it can be returned in one statement. */
        public Token build(TT type) {
            this.type = type;
            return this;
        }

        /** Appends a single character given as the int returned by {@code Reader.read()}. */
        public void append(int ch) {
            this.val.append((char) ch);
        }

        public void append(String s) {
            this.val.append(s);
        }

        @Override
        public String toString() {
            return "T:[" + type + "] V:[" + val + "]";
        }
    }

    /**
     * Splits character input into CSV tokens. Fields may be unquoted, wrapped
     * in double quotes, or wrapped in single quotes; CRLF, CR and LF are all
     * recognised as line breaks.
     *
     * @author nakawakashigeto
     */
    public static class CSVTokenizer {
        PushbackReader reader;
        static final int DQUOTE = '"';
        static final int QUOTE = '\'';
        static final int COMMA = ',';
        static final int EOF = -1;
        static final int CR = '\r';
        static final int LF = '\n';
        /* STATE */
        static final int ST_nonescaped = 1;
        static final int ST_escaped = 2;
        static final int ST_escaped_single_quote = 3;

        /** Tokenizes the characters of {@code s} directly. */
        public CSVTokenizer(String s) {
            this.reader = new PushbackReader(new BufferedReader(
                    new StringReader(s)));
        }

        /** Tokenizes a byte stream decoded with the platform default charset. */
        public CSVTokenizer(InputStream inputStream) {
            this.reader = new PushbackReader(new BufferedReader(
                    new InputStreamReader(inputStream)));
        }

        /**
         * Tokenizes a byte stream decoded with the named charset.
         *
         * @throws UnsupportedEncodingException
         *             when the charset name is not supported
         */
        public CSVTokenizer(InputStream inputStream, String charasetname)
                throws UnsupportedEncodingException {
            this.reader = new PushbackReader(new BufferedReader(
                    new InputStreamReader(inputStream, charasetname)));
        }

        /**
         * Reads the next token.
         * <ul>
         * <li>end of input gives an {@link TT#EOF} token;</li>
         * <li>a comma gives a {@link TT#COMMA} token whose value is ",";</li>
         * <li>"\r\n", "\r" and "\n" give {@link TT#CRLF}, {@link TT#CR} and
         * {@link TT#LF};</li>
         * <li>anything else gives a {@link TT#FIELD}. An unquoted field ends
         * before a comma, line break, double quote or end of input. A quoted
         * field keeps commas and line breaks verbatim, turns a doubled quote
         * into one literal quote, and also ends at end of input when the
         * closing quote is missing.</li>
         * </ul>
         *
         * @return the next token, never {@code null}
         * @throws IOException
         *             when the underlying reader fails
         */
        public Token token() throws IOException {
            int state = 0;
            Token token = new Token();
            loop: while (true) {
                int ch = read();
                switch (state) {
                case 0:
                    /*
                     * -- START: the first character decides the token kind --
                     */
                    if (ch == EOF) {
                        return token.build(TT.EOF);
                    }
                    if (ch == DQUOTE) {
                        // opening double quote; the quote itself is not stored
                        state = ST_escaped;
                        token.type = TT.FIELD;
                        break;
                    }
                    if (ch == QUOTE) {
                        // opening single quote; the quote itself is not stored
                        state = ST_escaped_single_quote;
                        token.type = TT.FIELD;
                        break;
                    }
                    if (ch == COMMA) {
                        token.append(ch);
                        return token.build(TT.COMMA);
                    }
                    if (ch == CR) {
                        ch = read();
                        if (ch == LF) {
                            return token.build(TT.CRLF);
                        }
                        // lone CR: give back the look-ahead character
                        unread(ch);
                        return token.build(TT.CR);
                    }
                    if (ch == LF) {
                        return token.build(TT.LF);
                    }
                    state = ST_nonescaped;
                    token.type = TT.FIELD;
                    // intentional fall through: ch is the first field character
                case ST_nonescaped:
                    /*
                     * -- unquoted field --
                     */
                    if (ch == EOF || !isTextdata(ch)) {
                        // the terminator belongs to the next token
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                case ST_escaped:
                    /*
                     * -- field in double quotes --
                     */
                    if (ch == EOF) {
                        // unterminated quote: return what was collected
                        return token.build(TT.FIELD);
                    }
                    if (ch == DQUOTE) {
                        ch = read();
                        if (ch == DQUOTE) {
                            // doubled quote is an escaped literal quote
                            token.append(DQUOTE);
                            break;
                        }
                        // closing quote: give back the look-ahead character
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                case ST_escaped_single_quote:
                    /*
                     * -- field in single quotes --
                     */
                    if (ch == EOF) {
                        // unterminated quote: return what was collected
                        return token.build(TT.FIELD);
                    }
                    if (ch == QUOTE) {
                        ch = read();
                        if (ch == QUOTE) {
                            // doubled quote is an escaped literal quote
                            token.append(QUOTE);
                            break;
                        }
                        // closing quote: give back the look-ahead character
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                default:
                    break loop;
                }
            }
            return token;
        }

        /** True unless {@code ch} is CR, LF, a double quote or a comma. */
        boolean isTextdata(int ch) {
            return notEq(ch, CR) && notEq(ch, LF) && notEq(ch, DQUOTE)
                    && notEq(ch, COMMA);
        }

        /** Reads one character; returns -1 at end of input or when no reader is set. */
        int read() throws IOException {
            if (reader != null) {
                return reader.read();
            }
            return -1;
        }

        boolean notEq(int l, int r) {
            return (l != r);
        }

        /** Pushes one character back; the end-of-input marker -1 is never pushed back. */
        void unread(int ch) throws IOException {
            if (reader != null && ch != -1) {
                reader.unread(ch);
            }
        }

        /** Closes the underlying reader, ignoring any failure. */
        public void close() {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best-effort close: nothing useful can be done here
                }
            }
        }
    } // end

    /**
     * Parses a byte stream decoded with the named charset.
     * {@code executor.pre()} runs first and {@code executor.post()} always
     * runs last, also when the charset is unsupported or parsing fails.
     * The stream is not closed by this method.
     *
     * @param inputStream
     *            CSV bytes
     * @param executor
     *            receives every parsed line
     * @param charasetname
     *            charset name used to decode the stream
     * @return number of lines passed to {@code executor.exec}
     * @throws IOException
     *             when reading fails or the charset is not supported
     */
    public static long parse(InputStream inputStream, IExecutor executor,
            String charasetname) throws IOException {
        executor.pre();
        try {
            return parseLines(new CSVTokenizer(inputStream, charasetname),
                    executor);
        } finally {
            executor.post();
        }
    }

    /**
     * Parses a byte stream decoded with the platform default charset.
     * The charset comes from {@code Charset.defaultCharset()} rather than the
     * {@code file.encoding} system property, because the property can be
     * unset, overwritten at runtime, or hold a value that is not a charset
     * name.
     *
     * @return number of lines passed to {@code executor.exec}
     * @throws IOException
     *             when reading fails
     */
    public static long parse(InputStream inputStream, IExecutor executor)
            throws IOException {
        return parse(inputStream, executor,
                java.nio.charset.Charset.defaultCharset().name());
    }

    /**
     * Parses CSV text held in a string. The characters are read directly,
     * without an intermediate byte encoding, so every character reaches the
     * executor unchanged.
     *
     * @return number of lines passed to {@code executor.exec}
     * @throws NullPointerException
     *             when {@code s} is {@code null}
     * @throws IOException
     *             when reading fails
     */
    public static long parse(String s, IExecutor executor) throws IOException {
        if (s == null) {
            throw new NullPointerException("s");
        }
        executor.pre();
        try {
            return parseLines(new CSVTokenizer(s), executor);
        } finally {
            executor.post();
        }
    }

    /** Reads tokens line by line and reports each line; returns the line count. */
    private static long parseLines(CSVTokenizer tokenizer, IExecutor executor)
            throws IOException {
        long row = 0;
        Token token;
        do {
            List<String> line = new ArrayList<String>();
            // previous token on the current line, null at line start
            Token preToken = null;
            while (!endsLine(token = tokenizer.token())) {
                if (token.type == TT.COMMA) {
                    // a comma at line start or right after another comma
                    // means the field before it was empty
                    if (preToken == null || preToken.type == TT.COMMA) {
                        line.add("");
                    }
                } else {
                    line.add(token.val.toString());
                }
                preToken = token;
            }
            // end of input right at a line start: no further line to report
            if (preToken == null && token.type == TT.EOF) {
                break;
            }
            // a trailing comma means the last field was empty
            if (preToken != null && preToken.type == TT.COMMA) {
                line.add("");
            }
            executor.exec(row++, line);
        } while (token.type != TT.EOF);
        return row;
    }

    /** True for the tokens that terminate a line: EOF, CRLF, CR and LF. */
    private static boolean endsLine(Token token) {
        return token.type == TT.EOF || token.type == TT.CRLF
                || token.type == TT.CR || token.type == TT.LF;
    }
}
: