RFC 4180対応版 CSVレコードの分解 - 精進しないと。
2007/08/14
2007/08/17
csv
java
rfc
RFC 4180対応版 CSVレコードの分解 どう書く?org
あとで書く。
うーん、しばらく暇つぶしにことかかないなぁ
いろいろな人の回答みたのですが、みなさんコンパクトですね。
自分のJava版だとサイズが大きいなぁ
参考
Comma-Separated Values - Wikipedia
RFC 4180 Common Format and MIME Type for Comma-Separated Values (CSV) Files
2007-08-16 その2
アプローチはきわめてオーソドックスだと思います。
コンパイラの本とかのはじめのページあたりででてくる、字句解析の感じを目指しています。
トークンクラスです。
package csv;
/**
 * One lexical token produced while scanning CSV text: a token type plus the
 * characters collected for it.
 */
public class Token {
    /** Kind of this token; {@code null} until assigned directly or through {@link #build(TT)}. */
    TT type;

    // Private, single-owner buffer, so the unsynchronized StringBuilder is sufficient.
    private final StringBuilder val = new StringBuilder();

    /**
     * Sets the token type and returns this token so the call can be used in a
     * {@code return} statement.
     *
     * @param type the type to assign
     * @return this token
     */
    public Token build(TT type) {
        this.type = type;
        return this;
    }

    /**
     * Appends one character to the token text.
     *
     * @param ch the character as an {@code int}; it is narrowed to {@code char}
     */
    public void append(int ch) {
        val.append((char) ch);
    }

    /**
     * Appends a string to the token text.
     *
     * @param s the text to append
     */
    public void append(String s) {
        val.append(s);
    }

    /** Returns a debug representation of the form {@code type:[T] val:[text]}. */
    @Override
    public String toString() {
        return "type:[" + type + "] val:[" + val + "]";
    }
}
/**
 * One lexical token produced while scanning CSV text: a token type plus the
 * characters collected for it.
 */
public class Token {
    /** Kind of this token; {@code null} until assigned directly or through {@link #build(TT)}. */
    TT type;

    // Private, single-owner buffer, so the unsynchronized StringBuilder is sufficient.
    private final StringBuilder val = new StringBuilder();

    /**
     * Sets the token type and returns this token so the call can be used in a
     * {@code return} statement.
     *
     * @param type the type to assign
     * @return this token
     */
    public Token build(TT type) {
        this.type = type;
        return this;
    }

    /**
     * Appends one character to the token text.
     *
     * @param ch the character as an {@code int}; it is narrowed to {@code char}
     */
    public void append(int ch) {
        val.append((char) ch);
    }

    /**
     * Appends a string to the token text.
     *
     * @param s the text to append
     */
    public void append(String s) {
        val.append(s);
    }

    /** Returns a debug representation of the form {@code type:[T] val:[text]}. */
    @Override
    public String toString() {
        return "type:[" + type + "] val:[" + val + "]";
    }
}
トークンタイプのクラスです。enum使ってます。
package csv;
/**
 * Token types emitted by the CSV tokenizer. Each constant carries the label
 * that {@link #toString()} returns.
 */
public enum TT {
    EOF("EOF"), FIELD("FIELD"), COMMA("COMMA"), CRLF("CRLF");

    /** Display label; final because enum constants are shared singletons. */
    final String s;

    TT(String s) {
        this.s = s;
    }

    /** Returns the label given to the constant's constructor. */
    @Override
    public String toString() {
        return s;
    }
}
/**
 * Token types emitted by the CSV tokenizer. Each constant carries the label
 * that {@link #toString()} returns.
 */
public enum TT {
    EOF("EOF"), FIELD("FIELD"), COMMA("COMMA"), CRLF("CRLF");

    /** Display label; final because enum constants are shared singletons. */
    final String s;

    TT(String s) {
        this.s = s;
    }

    /** Returns the label given to the constant's constructor. */
    @Override
    public String toString() {
        return s;
    }
}
CSVトークナイザーという名前にしました。
whileの部分のstateは状態遷移図に起こしやすいことを目指してます。が、厳密ではないので、あとで直します。
package csv;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
/**
 * Splits CSV text into {@link Token}s of type FIELD, COMMA, CRLF or EOF using
 * a small hand-written state machine.
 *
 * <p>Each call to {@link #token()} consumes just enough input for one token.
 * A quoted field may contain commas, line breaks and doubled quotes
 * ({@code ""} yields one {@code "}). Line endings are handled leniently: a
 * bare LF, or a CR that is not followed by LF, is reported as a CRLF token so
 * that the token stream always advances and eventually reaches EOF.
 */
public class CSVTokenizer {
    PushbackReader reader;
    static final int DQUOTE = '"';
    static final int COMMA = ',';
    static final int EOF = -1;
    static final int CR = '\r';
    static final int LF = '\n';

    // States of the scanner in token().
    private static final int STATE_START = 0;
    private static final int STATE_NON_ESCAPED = 1;
    private static final int STATE_ESCAPED = 2;
    private static final int STATE_AFTER_CR = 4;

    /**
     * Creates a tokenizer over in-memory CSV text.
     *
     * @param s the CSV text to scan
     */
    public CSVTokenizer(String s) {
        this.reader = new PushbackReader(new BufferedReader(new StringReader(s)));
    }

    /**
     * Creates a tokenizer over a byte stream. The bytes are decoded by an
     * {@link InputStreamReader} created without an explicit charset.
     *
     * @param inputStream the stream holding the CSV data
     */
    public CSVTokenizer(InputStream inputStream) {
        this.reader = new PushbackReader(new BufferedReader(new InputStreamReader(inputStream)));
    }

    /**
     * Reads the next token from the input.
     *
     * @return the next token; its type is EOF once the input is exhausted
     * @throws IOException if reading from or pushing back to the reader fails
     */
    public Token token() throws IOException {
        int state = STATE_START;
        Token token = new Token();
        while (true) {
            int ch = read();
            switch (state) {
            case STATE_START:
                if (ch == EOF) {
                    return token.build(TT.EOF);
                }
                if (ch == COMMA) {
                    token.append(ch);
                    return token.build(TT.COMMA);
                }
                if (ch == LF) {
                    // Bare LF: report it as a line break. Pushing it back here
                    // would hand out the same empty field on every call.
                    return token.build(TT.CRLF);
                }
                if (ch == CR) {
                    state = STATE_AFTER_CR;
                    break;
                }
                token.build(TT.FIELD);
                if (ch == DQUOTE) {
                    state = STATE_ESCAPED;
                    break;
                }
                // Any other character is the first character of a plain field.
                token.append(ch);
                state = STATE_NON_ESCAPED;
                break;
            case STATE_NON_ESCAPED:
                if (ch == EOF || !isTextdata(ch)) {
                    // The delimiter belongs to the next token.
                    unread(ch);
                    return token;
                }
                token.append(ch);
                break;
            case STATE_ESCAPED:
                if (ch == EOF) {
                    // Unterminated quoted field: return what was collected.
                    return token;
                }
                if (ch == DQUOTE) {
                    int next = read();
                    if (next == DQUOTE) {
                        // A doubled quote is an escaped literal quote.
                        token.append("\"");
                        break;
                    }
                    // Closing quote: keep the following character for later.
                    unread(next);
                    return token;
                }
                token.append(ch);
                break;
            case STATE_AFTER_CR:
                if (ch == EOF) {
                    return token.build(TT.EOF);
                }
                if (ch != LF) {
                    // Bare CR: the character after it starts the next token.
                    unread(ch);
                }
                return token.build(TT.CRLF);
            default:
                throw new IllegalStateException("unexpected tokenizer state: " + state);
            }
        }
    }

    /**
     * Tells whether {@code ch} may appear in an unquoted field, i.e. it is
     * not CR, LF, a double quote or a comma.
     */
    boolean isTextdata(int ch) {
        return notEq(ch, '\r') && notEq(ch, '\n') && notEq(ch, '"') && notEq(ch, ',');
    }

    /** Reads one character, or returns -1 at end of input or when no reader is set. */
    int read() throws IOException {
        if (reader != null) {
            return reader.read();
        }
        return -1;
    }

    /** Returns {@code true} when the two values differ. */
    boolean notEq(int l, int r) {
        return l != r;
    }

    /** Pushes {@code ch} back onto the reader; the end-of-input marker -1 is never pushed back. */
    void unread(int ch) throws IOException {
        if (reader != null && ch != -1) {
            reader.unread(ch);
        }
    }

    /** Closes the underlying reader; a failure while closing is ignored. */
    public void close() {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Best-effort close: nothing useful can be done at this point.
            }
        }
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.StringReader;
/**
 * Splits CSV text into {@link Token}s of type FIELD, COMMA, CRLF or EOF using
 * a small hand-written state machine.
 *
 * <p>Each call to {@link #token()} consumes just enough input for one token.
 * A quoted field may contain commas, line breaks and doubled quotes
 * ({@code ""} yields one {@code "}). Line endings are handled leniently: a
 * bare LF, or a CR that is not followed by LF, is reported as a CRLF token so
 * that the token stream always advances and eventually reaches EOF.
 */
public class CSVTokenizer {
    PushbackReader reader;
    static final int DQUOTE = '"';
    static final int COMMA = ',';
    static final int EOF = -1;
    static final int CR = '\r';
    static final int LF = '\n';

    // States of the scanner in token().
    private static final int STATE_START = 0;
    private static final int STATE_NON_ESCAPED = 1;
    private static final int STATE_ESCAPED = 2;
    private static final int STATE_AFTER_CR = 4;

    /**
     * Creates a tokenizer over in-memory CSV text.
     *
     * @param s the CSV text to scan
     */
    public CSVTokenizer(String s) {
        this.reader = new PushbackReader(new BufferedReader(new StringReader(s)));
    }

    /**
     * Creates a tokenizer over a byte stream. The bytes are decoded by an
     * {@link InputStreamReader} created without an explicit charset.
     *
     * @param inputStream the stream holding the CSV data
     */
    public CSVTokenizer(InputStream inputStream) {
        this.reader = new PushbackReader(new BufferedReader(new InputStreamReader(inputStream)));
    }

    /**
     * Reads the next token from the input.
     *
     * @return the next token; its type is EOF once the input is exhausted
     * @throws IOException if reading from or pushing back to the reader fails
     */
    public Token token() throws IOException {
        int state = STATE_START;
        Token token = new Token();
        while (true) {
            int ch = read();
            switch (state) {
            case STATE_START:
                if (ch == EOF) {
                    return token.build(TT.EOF);
                }
                if (ch == COMMA) {
                    token.append(ch);
                    return token.build(TT.COMMA);
                }
                if (ch == LF) {
                    // Bare LF: report it as a line break. Pushing it back here
                    // would hand out the same empty field on every call.
                    return token.build(TT.CRLF);
                }
                if (ch == CR) {
                    state = STATE_AFTER_CR;
                    break;
                }
                token.build(TT.FIELD);
                if (ch == DQUOTE) {
                    state = STATE_ESCAPED;
                    break;
                }
                // Any other character is the first character of a plain field.
                token.append(ch);
                state = STATE_NON_ESCAPED;
                break;
            case STATE_NON_ESCAPED:
                if (ch == EOF || !isTextdata(ch)) {
                    // The delimiter belongs to the next token.
                    unread(ch);
                    return token;
                }
                token.append(ch);
                break;
            case STATE_ESCAPED:
                if (ch == EOF) {
                    // Unterminated quoted field: return what was collected.
                    return token;
                }
                if (ch == DQUOTE) {
                    int next = read();
                    if (next == DQUOTE) {
                        // A doubled quote is an escaped literal quote.
                        token.append("\"");
                        break;
                    }
                    // Closing quote: keep the following character for later.
                    unread(next);
                    return token;
                }
                token.append(ch);
                break;
            case STATE_AFTER_CR:
                if (ch == EOF) {
                    return token.build(TT.EOF);
                }
                if (ch != LF) {
                    // Bare CR: the character after it starts the next token.
                    unread(ch);
                }
                return token.build(TT.CRLF);
            default:
                throw new IllegalStateException("unexpected tokenizer state: " + state);
            }
        }
    }

    /**
     * Tells whether {@code ch} may appear in an unquoted field, i.e. it is
     * not CR, LF, a double quote or a comma.
     */
    boolean isTextdata(int ch) {
        return notEq(ch, '\r') && notEq(ch, '\n') && notEq(ch, '"') && notEq(ch, ',');
    }

    /** Reads one character, or returns -1 at end of input or when no reader is set. */
    int read() throws IOException {
        if (reader != null) {
            return reader.read();
        }
        return -1;
    }

    /** Returns {@code true} when the two values differ. */
    boolean notEq(int l, int r) {
        return l != r;
    }

    /** Pushes {@code ch} back onto the reader; the end-of-input marker -1 is never pushed back. */
    void unread(int ch) throws IOException {
        if (reader != null && ch != -1) {
            reader.unread(ch);
        }
    }

    /** Closes the underlying reader; a failure while closing is ignored. */
    public void close() {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Best-effort close: nothing useful can be done at this point.
            }
        }
    }
}
テスト書かないときちんと動いているかわからないですね。
2007-08-16 その1
最初の試み。その2をあとで書こう。全然厳密ではない。
まず、トークンに分けて、トークンを処理するという流れで。
package csv;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.StringReader;
/**
 * Self-contained first version of the CSV tokenizer: a demo driver plus
 * nested {@link Token} and {@link CSVTokenizer} classes that use {@code int}
 * token types.
 */
public class A {
    public static void main(String[] args) throws IOException {
        a();
    }

    /**
     * Tokenizes a fixed set of sample inputs and prints every token to
     * standard output.
     *
     * @throws IOException if reading a sample fails
     */
    public static void a() throws IOException {
        String s;
        CSVTokenizer tokenizer = new CSVTokenizer("aaa");
        print(tokenizer);
        System.out.println("-------------");
        tokenizer = new CSVTokenizer("aaa,bbb,ccc");
        System.out.println("-------------");
        print(tokenizer);
        tokenizer = new CSVTokenizer("aaa,bbb,ccc\r\n");
        System.out.println("-------------");
        print(tokenizer);
        tokenizer = new CSVTokenizer(" aaa , bbb , ccc \r\n");
        System.out.println("-------------");
        print(tokenizer);
        tokenizer = new CSVTokenizer("日本語, bbb , ccc \r\n日本語2, bbb2 , ccc2");
        System.out.println("-------------");
        print(tokenizer);
        System.out.println("------------- check double quote.");
        tokenizer = new CSVTokenizer("\"日本語\",\" bbb \",\" ccc \r\n日本語2\", bbb2 , ccc2");
        print(tokenizer);
        System.out.println("------------- ");
        s = "\"日本語\",\" bbb \",\" ccc \r\n日本語2\", bbb2 , ccc2\r\n二行目aaa,二行目 bbb, \"二行目ccc \"\"OK OK \"";
        System.out.println(s);
        tokenizer = new CSVTokenizer(s);
        print(tokenizer);
    }

    /** Prints tokens one per line, up to and including the EOF token. */
    static void print(CSVTokenizer tokenizer) throws IOException {
        Token token;
        do {
            token = tokenizer.token();
            System.out.println(token);
        } while (token != null && token.type != TT_EOF);
    }

    // Token types.
    static final int TT_EOF = -1;
    static final int TT_FIELD = 0;
    static final int TT_COMMA = 1;
    static final int TT_CRLF = 2;
    // Characters with special meaning to the tokenizer.
    static final int DQUOTE = '"';
    static final int COMMA = ',';
    static final int EOF = -1;
    static final int CR = '\r';
    static final int LF = '\n';

    /** A token: one of the {@code TT_*} type codes plus the collected text. */
    static class Token {
        int type;

        // Private, single-owner buffer, so the unsynchronized StringBuilder is sufficient.
        private final StringBuilder val = new StringBuilder();

        /** Sets the token type and returns this token for use in a {@code return}. */
        public Token build(int type) {
            this.type = type;
            return this;
        }

        /** Appends one character, given as an {@code int} and narrowed to {@code char}. */
        public void append(int ch) {
            val.append((char) ch);
        }

        /** Appends a string to the token text. */
        public void append(String s) {
            val.append(s);
        }

        /** Returns a debug representation of the form {@code type:[N] val:[text]}. */
        @Override
        public String toString() {
            return "type:[" + type + "] val:[" + val + "]";
        }
    }

    /**
     * State-machine tokenizer over a string. Line endings are handled
     * leniently: a bare LF, or a CR not followed by LF, is reported as a
     * {@code TT_CRLF} token so that the token stream always advances.
     */
    static class CSVTokenizer {
        // States of the scanner in token().
        private static final int STATE_START = 0;
        private static final int STATE_NON_ESCAPED = 1;
        private static final int STATE_ESCAPED = 2;
        private static final int STATE_AFTER_CR = 4;

        PushbackReader reader;

        public CSVTokenizer(String s) {
            this.reader = new PushbackReader(new StringReader(s));
        }

        /**
         * Reads the next token from the input.
         *
         * @return the next token; its type is {@code TT_EOF} once the input is exhausted
         * @throws IOException if reading from or pushing back to the reader fails
         */
        public Token token() throws IOException {
            int state = STATE_START;
            Token token = new Token();
            while (true) {
                int ch = read();
                switch (state) {
                case STATE_START:
                    if (ch == EOF) {
                        return token.build(TT_EOF);
                    }
                    if (ch == COMMA) {
                        token.append(ch);
                        return token.build(TT_COMMA);
                    }
                    if (ch == LF) {
                        // Bare LF: report it as a line break. Pushing it back
                        // here would hand out the same empty field forever.
                        return token.build(TT_CRLF);
                    }
                    if (ch == CR) {
                        state = STATE_AFTER_CR;
                        break;
                    }
                    token.build(TT_FIELD);
                    if (ch == DQUOTE) {
                        state = STATE_ESCAPED;
                        break;
                    }
                    // Any other character starts a plain field.
                    token.append(ch);
                    state = STATE_NON_ESCAPED;
                    break;
                case STATE_NON_ESCAPED:
                    if (ch == EOF || !isTextdata(ch)) {
                        // The delimiter belongs to the next token.
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                case STATE_ESCAPED:
                    if (ch == EOF) {
                        // Unterminated quoted field: return what was collected.
                        return token;
                    }
                    if (ch == DQUOTE) {
                        // Keep the value as int: narrowing -1 to char would
                        // turn end-of-input into the character U+FFFF.
                        int next = read();
                        if (next == DQUOTE) {
                            // A doubled quote is an escaped literal quote.
                            token.append("\"");
                            break;
                        }
                        unread(next);
                        return token;
                    }
                    token.append(ch);
                    break;
                case STATE_AFTER_CR:
                    if (ch == EOF) {
                        return token.build(TT_EOF);
                    }
                    if (ch != LF) {
                        // Bare CR: the character after it starts the next token.
                        unread(ch);
                    }
                    return token.build(TT_CRLF);
                default:
                    throw new IllegalStateException("unexpected tokenizer state: " + state);
                }
            }
        }

        /** Tells whether {@code ch} is none of CR, LF, double quote or comma. */
        boolean isTextdata(int ch) {
            return not(ch, '\r') && not(ch, '\n') && not(ch, '"') && not(ch, ',');
        }

        /** Reads one character, or returns -1 at end of input or when no reader is set. */
        int read() throws IOException {
            if (reader != null) {
                return reader.read();
            }
            return -1;
        }

        /** Returns {@code true} when the two values differ. */
        boolean not(int l, int r) {
            return l != r;
        }

        /** Pushes {@code ch} back onto the reader; the end-of-input marker -1 is never pushed back. */
        void unread(int ch) throws IOException {
            if (reader != null && ch != -1) {
                reader.unread(ch);
            }
        }
    }
}
import java.io.IOException;
import java.io.PushbackReader;
import java.io.StringReader;
/**
 * Self-contained first version of the CSV tokenizer: a demo driver plus
 * nested {@link Token} and {@link CSVTokenizer} classes that use {@code int}
 * token types.
 */
public class A {
    public static void main(String[] args) throws IOException {
        a();
    }

    /**
     * Tokenizes a fixed set of sample inputs and prints every token to
     * standard output.
     *
     * @throws IOException if reading a sample fails
     */
    public static void a() throws IOException {
        String s;
        CSVTokenizer tokenizer = new CSVTokenizer("aaa");
        print(tokenizer);
        System.out.println("-------------");
        tokenizer = new CSVTokenizer("aaa,bbb,ccc");
        System.out.println("-------------");
        print(tokenizer);
        tokenizer = new CSVTokenizer("aaa,bbb,ccc\r\n");
        System.out.println("-------------");
        print(tokenizer);
        tokenizer = new CSVTokenizer(" aaa , bbb , ccc \r\n");
        System.out.println("-------------");
        print(tokenizer);
        tokenizer = new CSVTokenizer("日本語, bbb , ccc \r\n日本語2, bbb2 , ccc2");
        System.out.println("-------------");
        print(tokenizer);
        System.out.println("------------- check double quote.");
        tokenizer = new CSVTokenizer("\"日本語\",\" bbb \",\" ccc \r\n日本語2\", bbb2 , ccc2");
        print(tokenizer);
        System.out.println("------------- ");
        s = "\"日本語\",\" bbb \",\" ccc \r\n日本語2\", bbb2 , ccc2\r\n二行目aaa,二行目 bbb, \"二行目ccc \"\"OK OK \"";
        System.out.println(s);
        tokenizer = new CSVTokenizer(s);
        print(tokenizer);
    }

    /** Prints tokens one per line, up to and including the EOF token. */
    static void print(CSVTokenizer tokenizer) throws IOException {
        Token token;
        do {
            token = tokenizer.token();
            System.out.println(token);
        } while (token != null && token.type != TT_EOF);
    }

    // Token types.
    static final int TT_EOF = -1;
    static final int TT_FIELD = 0;
    static final int TT_COMMA = 1;
    static final int TT_CRLF = 2;
    // Characters with special meaning to the tokenizer.
    static final int DQUOTE = '"';
    static final int COMMA = ',';
    static final int EOF = -1;
    static final int CR = '\r';
    static final int LF = '\n';

    /** A token: one of the {@code TT_*} type codes plus the collected text. */
    static class Token {
        int type;

        // Private, single-owner buffer, so the unsynchronized StringBuilder is sufficient.
        private final StringBuilder val = new StringBuilder();

        /** Sets the token type and returns this token for use in a {@code return}. */
        public Token build(int type) {
            this.type = type;
            return this;
        }

        /** Appends one character, given as an {@code int} and narrowed to {@code char}. */
        public void append(int ch) {
            val.append((char) ch);
        }

        /** Appends a string to the token text. */
        public void append(String s) {
            val.append(s);
        }

        /** Returns a debug representation of the form {@code type:[N] val:[text]}. */
        @Override
        public String toString() {
            return "type:[" + type + "] val:[" + val + "]";
        }
    }

    /**
     * State-machine tokenizer over a string. Line endings are handled
     * leniently: a bare LF, or a CR not followed by LF, is reported as a
     * {@code TT_CRLF} token so that the token stream always advances.
     */
    static class CSVTokenizer {
        // States of the scanner in token().
        private static final int STATE_START = 0;
        private static final int STATE_NON_ESCAPED = 1;
        private static final int STATE_ESCAPED = 2;
        private static final int STATE_AFTER_CR = 4;

        PushbackReader reader;

        public CSVTokenizer(String s) {
            this.reader = new PushbackReader(new StringReader(s));
        }

        /**
         * Reads the next token from the input.
         *
         * @return the next token; its type is {@code TT_EOF} once the input is exhausted
         * @throws IOException if reading from or pushing back to the reader fails
         */
        public Token token() throws IOException {
            int state = STATE_START;
            Token token = new Token();
            while (true) {
                int ch = read();
                switch (state) {
                case STATE_START:
                    if (ch == EOF) {
                        return token.build(TT_EOF);
                    }
                    if (ch == COMMA) {
                        token.append(ch);
                        return token.build(TT_COMMA);
                    }
                    if (ch == LF) {
                        // Bare LF: report it as a line break. Pushing it back
                        // here would hand out the same empty field forever.
                        return token.build(TT_CRLF);
                    }
                    if (ch == CR) {
                        state = STATE_AFTER_CR;
                        break;
                    }
                    token.build(TT_FIELD);
                    if (ch == DQUOTE) {
                        state = STATE_ESCAPED;
                        break;
                    }
                    // Any other character starts a plain field.
                    token.append(ch);
                    state = STATE_NON_ESCAPED;
                    break;
                case STATE_NON_ESCAPED:
                    if (ch == EOF || !isTextdata(ch)) {
                        // The delimiter belongs to the next token.
                        unread(ch);
                        return token;
                    }
                    token.append(ch);
                    break;
                case STATE_ESCAPED:
                    if (ch == EOF) {
                        // Unterminated quoted field: return what was collected.
                        return token;
                    }
                    if (ch == DQUOTE) {
                        // Keep the value as int: narrowing -1 to char would
                        // turn end-of-input into the character U+FFFF.
                        int next = read();
                        if (next == DQUOTE) {
                            // A doubled quote is an escaped literal quote.
                            token.append("\"");
                            break;
                        }
                        unread(next);
                        return token;
                    }
                    token.append(ch);
                    break;
                case STATE_AFTER_CR:
                    if (ch == EOF) {
                        return token.build(TT_EOF);
                    }
                    if (ch != LF) {
                        // Bare CR: the character after it starts the next token.
                        unread(ch);
                    }
                    return token.build(TT_CRLF);
                default:
                    throw new IllegalStateException("unexpected tokenizer state: " + state);
                }
            }
        }

        /** Tells whether {@code ch} is none of CR, LF, double quote or comma. */
        boolean isTextdata(int ch) {
            return not(ch, '\r') && not(ch, '\n') && not(ch, '"') && not(ch, ',');
        }

        /** Reads one character, or returns -1 at end of input or when no reader is set. */
        int read() throws IOException {
            if (reader != null) {
                return reader.read();
            }
            return -1;
        }

        /** Returns {@code true} when the two values differ. */
        boolean not(int l, int r) {
            return l != r;
        }

        /** Pushes {@code ch} back onto the reader; the end-of-input marker -1 is never pushed back. */
        void unread(int ch) throws IOException {
            if (reader != null && ch != -1) {
                reader.unread(ch);
            }
        }
    }
}
: