いまだにCSVその2 2010/09/18

Javaです。CSVを処理します。
とりあえず、JUnit4(使ったことがなかったので)でテスト用意して、自分のニーズに対応しているだろうことを確認。
これでCSVから離れられる....
テスト用データ、ソースは全部、コミット。
CSVUtils.java - quicklunch - Project Hosting on Google Code

package quicklunch.e2.goodies.utils;

import static org.junit.Assert.*;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.junit.Test;

/**
 * JUnit 4 tests for {@link CSVUtils#parse}.
 *
 * Most cases read fixture files from the class-path directory
 * {@code doc-files/csv/} (relative to this class); the fixtures themselves are
 * not part of this file, so the expectations below are the only description of
 * their content.
 *
 * NOTE(review): in several tests the assertions live only inside the
 * {@code exec} callback and the value returned by {@code parse} is not
 * checked, so those tests would also pass if the callback were never invoked.
 */
public class CSVUtilsTest {

// Opens a fixture as a class-path resource located under "doc-files/csv/".
// The returned stream is never closed by the tests in this class.
static InputStream getStream(String filename) {
return CSVUtilsTest.class.getResourceAsStream("doc-files/csv/"
+ filename);
}

/**
 * Basic field splitting: plain fields, empty fields and surrounding spaces.
 */
@Test
public void testParse001() throws IOException {
// 001.csv: every reported row must be row 0 and hold three fields.
CSVUtils.parse(getStream("001.csv"), new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {
assertEquals(0, row);
assertEquals(3, line.size());
}
});

// 002.csv: a single row whose first, middle and last fields are empty.
CSVUtils.parse(getStream("002.csv"), new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {
assertEquals(0, row);
assertEquals(5, line.size());
assertEquals("", line.get(0));// leading empty field
assertEquals("bbb", line.get(1));
assertEquals("", line.get(2));// empty field between two commas
assertEquals("ccc", line.get(3));
assertEquals("", line.get(4));// trailing empty field
}
});

// 003.csv: only rows 0 and 1 are checked; any further rows are ignored.
CSVUtils.parse(getStream("003.csv"), new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {

if (0 == row) {
assertEquals(3, line.size());
assertEquals("aaa", line.get(0));
assertEquals("bbb", line.get(1));
assertEquals("ccc", line.get(2));
}

if (1 == row) {
assertEquals(4, line.size());
assertEquals("000", line.get(0));
// leading and trailing spaces must be preserved
assertEquals(" 111 ", line.get(1));
// leading space must be preserved
assertEquals(" 222", line.get(2));
// trailing space must be preserved
assertEquals("333 ", line.get(3));
}
}
});
}

/**
 * Empty lines, whitespace-only fields and comma-only lines (004.csv).
 */
@Test
public void testParse002() throws IOException {
CSVUtils.parse(getStream("004.csv"), new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {
if (row == 0) {
// an empty line is reported as a row with no fields
assertEquals(0, line.size());
}
if (row == 1) {
assertEquals(1, line.size());
// a line holding one space is a single field " "
assertEquals(" ", line.get(0));
}

if (row == 2) {
// a lone comma yields two empty fields
assertEquals(2, line.size());
assertEquals("", line.get(0));// empty field
assertEquals("", line.get(1));// empty field
}

if (row == 3) {
// a space followed by a comma: " " then an empty field
assertEquals(2, line.size());
assertEquals(" ", line.get(0));// space
assertEquals("", line.get(1));// empty field
}

if (row == 4) {
// empty line
assertEquals(0, line.size());
}

if (row == 5) {
assertEquals(1, line.size());
assertEquals(" ", line.get(0));// space
}
}
});
}

/**
 * Parsing from a String: non-ASCII text, empty input and a trailing newline.
 * These cases also check the row count returned by parse.
 */
@Test
public void testParse003() throws IOException {
{
// one field, no line terminator -> exactly one row
long n = CSVUtils.parse("日本語", new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {
assertEquals(0, row);
assertEquals(1, line.size());
assertEquals("日本語", line.get(0));
}
});
assertEquals(1, n);
}
{
// empty input -> zero rows expected; the callback assertions only run
// if exec is invoked at all
long n = CSVUtils.parse("", new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {
assertEquals(0, row);
assertEquals(0, line.size());
}
});
assertEquals(0, n);
}
{
// a trailing newline must not produce an additional row
long n = CSVUtils.parse("日本語\n", new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {
if (row == 0) {
assertEquals(1, line.size());
assertEquals("日本語", line.get(0));
}
}
});
assertEquals(1, n);
}
}

/**
 * Explicit charset ("utf8") with a fixture that, per the expectations below,
 * contains fields with embedded CRLF line breaks (005_utf8_crlf.csv).
 */
@Test
public void testParse004() throws IOException {
{
// decode the stream with the charset name given as the third argument
long n = CSVUtils.parse(getStream("005_utf8_crlf.csv"),
new CSVUtils.AbstractExecutor() {
@Override
public void exec(long row, List<String> line) {

if (0 == row) {
assertEquals(5, line.size());
assertEquals("日本語1", line.get(0));
assertEquals("日本語2", line.get(1));
assertEquals("日本語3", line.get(2));
assertEquals(" 日本語4", line.get(3));// leading space kept
assertEquals("日本語5 ", line.get(4));// trailing space kept
}

if (1 == row) {
assertEquals(5, line.size());
// a line break inside a field is kept verbatim
assertEquals("dd\r\neeee", line.get(0));
assertEquals("fff", line.get(1));
assertEquals(" gg ", line.get(2));
assertEquals("", line.get(3));// empty field
assertEquals("hhh", line.get(4));
}

if (2 == row) {
assertEquals(5, line.size());
assertEquals("aaa", line.get(0));
assertEquals("bbb\r\nッッ", line.get(1));
assertEquals("ccc", line.get(2));
assertEquals("dddd", line.get(3));
assertEquals("", line.get(4));// trailing empty field
}
}
}, "utf8");

// the fixture is expected to contain exactly three rows
assertEquals(3, n);
}
}

}



package quicklunch.e2.goodies.utils;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;


/**
 * Small callback-driven CSV reader.
 *
 * Input is split into tokens by {@link CSVTokenizer}; the {@code parse}
 * methods assemble those tokens into rows and hand every row to an
 * {@link IExecutor}. Fields may be unquoted, double-quoted or single-quoted;
 * CRLF, a lone CR and a lone LF are all accepted as row terminators.
 *
 * @author nakawakashigeto
 */
public abstract class CSVUtils {

    /**
     * Callback that receives the rows produced by {@code parse}.
     */
    public interface IExecutor {
        /** Called once before any input is read. */
        public void pre();

        /**
         * Called once per row.
         *
         * @param row
         *            zero-based row index
         * @param line
         *            the fields of the row, in input order; empty for an
         *            empty line
         */
        public void exec(long row, List<String> line);

        /** Called once after parsing, also when parsing failed. */
        public void post();
    }

    /**
     * {@link IExecutor} with empty implementations, so that callers override
     * only the callbacks they need.
     */
    abstract static public class AbstractExecutor implements IExecutor {
        public void pre() {
        }

        public void exec(long row, List<String> line) {
        }

        public void post() {
        }
    }// end

    // ===================

    /**
     * Token types produced by {@link CSVTokenizer}.
     */
    public enum TT {
        EOF("EOF"), FIELD("FIELD"), COMMA("COMMA"), CRLF("CRLF"), CR("CR"), LF(
                "LF");

        /** Display text returned by {@link #toString()}. */
        String s;

        TT(String s) {
            this.s = s;
        }

        @Override
        public String toString() {
            return s;
        }
    }

    /**
     * One lexical token: its type plus the text collected for it.
     *
     * @author nakawakashigeto
     */
    public static class Token {
        TT type;
        public StringBuilder val = new StringBuilder();

        /**
         * Sets the token type.
         *
         * @param type
         *            type to assign
         * @return this token, to allow {@code return token.build(...)}
         */
        public Token build(TT type) {
            this.type = type;
            return this;
        }

        /**
         * Appends one character, given as the int returned by a reader.
         *
         * @param ch
         *            character value; narrowed to {@code char}
         */
        public void append(int ch) {
            this.val.append((char) ch);
        }

        /**
         * Appends a string to the token text.
         *
         * @param s
         *            text to append
         */
        public void append(String s) {
            this.val.append(s);
        }

        @Override
        public String toString() {
            return "T:[" + type + "] V:[" + val + "]";
        }
    }

    /**
     * Splits character input into {@link Token}s.
     *
     * @author nakawakashigeto
     */
    public static class CSVTokenizer {

        /** Source of characters; at most one character is pushed back. */
        PushbackReader reader;

        static final int DQUOTE = '"';
        static final int QUOTE = '\'';
        static final int COMMA = ',';
        static final int EOF = -1;
        static final int CR = '\r';
        static final int LF = '\n';

        /* STATE (0 is the start state) */
        static final int ST_nonescaped = 1;
        static final int ST_escaped = 2;
        static final int ST_escaped_single_quote = 3;

        /**
         * Tokenizes the characters of a string.
         *
         * @param s
         *            CSV text
         */
        public CSVTokenizer(String s) {
            this.reader = new PushbackReader(new BufferedReader(
                    new StringReader(s)));
        }

        /**
         * Tokenizes a byte stream decoded with the platform default charset.
         *
         * @param inputStream
         *            CSV bytes
         */
        public CSVTokenizer(InputStream inputStream) {
            this.reader = new PushbackReader(new BufferedReader(
                    new InputStreamReader(inputStream)));
        }

        /**
         * Tokenizes a byte stream decoded with the named charset.
         *
         * @param inputStream
         *            CSV bytes
         * @param charasetname
         *            charset name understood by {@link InputStreamReader}
         * @throws UnsupportedEncodingException
         *             if the charset name is not supported
         */
        public CSVTokenizer(InputStream inputStream, String charasetname)
                throws UnsupportedEncodingException {
            this.reader = new PushbackReader(new BufferedReader(
                    new InputStreamReader(inputStream, charasetname)));
        }

        /**
         * Reads the next token.
         *
         * A FIELD token carries the field text without its enclosing quotes;
         * inside a quoted field a doubled quote character stands for one
         * literal quote. A quoted field that is still open at end of input is
         * returned with the text read so far.
         *
         * @return the next token; type {@link TT#EOF} at end of input
         * @throws IOException
         *             if the underlying reader fails
         */
        public Token token() throws IOException {

            int state = 0;

            Token token = new Token();
            loop: while (true) {
                int ch = read();

                switch (state) {
                case 0:
                    /*
                     * -- START --
                     */
                    if (ch == EOF) {
                        return token.build(TT.EOF);
                    }

                    // opening double quote: the quote itself is not stored
                    if (ch == DQUOTE) {
                        state = ST_escaped;
                        token.type = TT.FIELD;
                        break;
                    }

                    // opening single quote: the quote itself is not stored
                    if (ch == QUOTE) {
                        state = ST_escaped_single_quote;
                        token.type = TT.FIELD;
                        break;
                    }

                    if (ch == COMMA) {
                        token.append(ch);
                        return token.build(TT.COMMA);
                    }

                    if (ch == CR) {
                        ch = read();
                        if (ch == LF) {
                            // default CRLF
                            return token.build(TT.CRLF);
                        }

                        // lone CR: give the look-ahead character back
                        unread(ch);
                        return token.build(TT.CR);
                    }

                    // lone LF
                    if (ch == LF) {
                        return token.build(TT.LF);
                    }

                    state = ST_nonescaped;
                    token.type = TT.FIELD;
                    // intentional fall through: ch is the first character of
                    // an unquoted field
                case ST_nonescaped:
                    /*
                     * -- non-escaped --
                     * The field ends before end of input, CR, LF, a double
                     * quote or a comma; that character is left for the next
                     * call.
                     */
                    if (ch == EOF || !isTextdata(ch)) {
                        unread(ch);
                        return token;
                    }

                    token.append(ch);
                    break;
                case ST_escaped:
                    /*
                     * -- escaped(double quote) --
                     */

                    if (ch == EOF) {
                        return token.build(TT.FIELD);
                    }

                    if (ch == DQUOTE) {
                        ch = read();
                        if (ch == DQUOTE) {
                            // doubled quote -> one literal quote
                            token.append("\"");
                            state = ST_escaped;
                            break;
                        }
                        // closing quote: give the look-ahead character back
                        unread(ch);
                        return token;
                    }

                    token.append(ch);
                    break;

                case ST_escaped_single_quote:
                    /*
                     * -- escaped(single quote) --
                     */
                    if (ch == EOF) {
                        return token.build(TT.FIELD);
                    }

                    if (ch == QUOTE) {
                        ch = read();
                        if (ch == QUOTE) {
                            // doubled quote -> one literal quote
                            token.append("\'");
                            state = ST_escaped_single_quote;
                            break;
                        }
                        // closing quote: give the look-ahead character back
                        unread(ch);
                        return token;
                    }

                    token.append(ch);
                    break;

                default:
                    break loop;
                }
            }

            return token;
        }

        /**
         * Tells whether a character may be part of an unquoted field.
         *
         * @param ch
         *            character value
         * @return false for CR, LF, double quote and comma, true otherwise
         */
        boolean isTextdata(int ch) {
            if (notEq(ch, '\r') && notEq(ch, '\n') && notEq(ch, '"')
                    && notEq(ch, ',')) {
                return true;
            }
            return false;
        }

        /**
         * Reads one character.
         *
         * @return the character, or -1 at end of input or without a reader
         * @throws IOException
         *             if the underlying reader fails
         */
        int read() throws IOException {
            if (reader != null)
                return reader.read();
            return -1;
        }

        boolean notEq(int l, int r) {
            return (l != r);
        }

        /**
         * Pushes one character back; the end-of-input marker (-1) is ignored.
         *
         * @param ch
         *            character value to push back
         * @throws IOException
         *             if the underlying reader rejects the push back
         */
        void unread(int ch) throws IOException {
            if (reader != null && ch != -1) {
                reader.unread(ch);
            }
        }

        /**
         * Closes the underlying reader; a failure while closing is ignored.
         */
        public void close() {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best effort: this method does not declare IOException
                }
            }
        }
    } // end

    /**
     * Parses CSV bytes decoded with the given charset.
     *
     * {@code pre()} is called first and {@code post()} last, the latter also
     * when reading or a callback fails. The stream is not closed by this
     * method; that stays with the caller.
     *
     * @param inputStream
     *            CSV bytes
     * @param executor
     *            receiver of the rows
     * @param charasetname
     *            charset name used to decode the bytes
     * @return number of rows passed to {@code exec}
     * @throws IOException
     *             if reading fails or the charset name is not supported
     */
    public static long parse(InputStream inputStream, IExecutor executor,
            String charasetname) throws IOException {
        executor.pre();

        try {
            return readRows(new CSVTokenizer(inputStream, charasetname),
                    executor);
        } finally {
            executor.post();
        }
    }

    /**
     * Parses CSV bytes decoded with the charset named by the
     * {@code file.encoding} system property.
     *
     * @param inputStream
     *            CSV bytes
     * @param executor
     *            receiver of the rows
     * @return number of rows passed to {@code exec}
     * @throws IOException
     *             if reading fails
     */
    public static long parse(InputStream inputStream, IExecutor executor)
            throws IOException {
        return parse(inputStream, executor, System.getProperty("file.encoding"));
    }

    /**
     * Parses CSV text held in a string.
     *
     * The characters are read directly, without an intermediate byte
     * encoding, so every {@code char} of the input (including an unpaired
     * surrogate) reaches the fields unchanged.
     *
     * @param s
     *            CSV text; must not be null
     * @param executor
     *            receiver of the rows
     * @return number of rows passed to {@code exec}
     * @throws IOException
     *             if reading fails
     */
    public static long parse(String s, IExecutor executor) throws IOException {
        if (s == null) {
            // reject null before any callback runs
            throw new NullPointerException("s");
        }

        executor.pre();

        try {
            return readRows(new CSVTokenizer(s), executor);
        } finally {
            executor.post();
        }
    }

    /** Assembles tokens into rows and passes each row to the executor. */
    private static long readRows(CSVTokenizer tokenizer, IExecutor executor)
            throws IOException {
        long row = 0;
        Token token = null;

        do {
            List<String> line = new ArrayList<String>();
            // previous token of the current row; null at the start of a row
            Token preToken = null;

            while (true) {
                token = tokenizer.token();
                if (isRowEnd(token)) {
                    break;
                }

                if (token.type == TT.COMMA) {
                    // a comma at the start of a row or right after another
                    // comma closes an empty field; otherwise it only
                    // separates fields
                    if (preToken == null || preToken.type == TT.COMMA) {
                        line.add("");
                    }
                } else {
                    line.add(token.val.toString());
                }
                preToken = token;
            }

            // nothing left after the last row terminator: no extra row
            if (preToken == null && token.type == TT.EOF) {
                break;
            }

            // a row ending with a comma has one more, empty, field
            if (preToken != null && preToken.type == TT.COMMA) {
                line.add("");
            }

            executor.exec(row++, line);
        } while (token.type != TT.EOF);

        return row;
    }

    /** Tells whether the token ends the current row (EOF, CRLF, CR or LF). */
    private static boolean isRowEnd(Token token) {
        return token.type == TT.EOF || token.type == TT.CRLF
                || token.type == TT.CR || token.type == TT.LF;
    }

}

: