2017-12-05

各言語でUTF-8バイト列を文字列置換および文字列分割してみる

Java C C++ PHP Python Ruby Perl Go bash

各言語でUTF-8のバイト列を読み込み、文字列置換と文字列分割をしてみたメモ。

要件は以下の通り。

標準入力から、文字列が1行だけ入力される。
- 文字エンコーディングはUTF-8
- 入力文字数は高々10文字とする
標準出力に、以下の2つを改行区切りで出力する。
- 文字列の各文字をすべて'.'で置き換えた文字列
- 入力文字列の各文字を改行で区切ったもの
つまり、10文字の文字列が入力されたら、出力は11行になる

環境

手元にあるものということで、環境は以下のものに限定する。

CentOS 7
- Java (openjdk version "1.8.0_151")
- C (gcc (GCC) 4.8.5)
  - -std=gnu11でコンパイル
- C++ (g++ (GCC) 4.8.5)
  - -std=gnu++1yでコンパイル
- PHP (PHP 5.4.16 (cli))
- Python 2 (Python 2.7.5)
- Python 3 (Python 3.6.3)
  - ソースからビルドしたもの
- Ruby (ruby 2.0.0p648)
- Perl (v5.16.3)
- Go (go version go1.8.3 linux/amd64)
- bash (4.2.46(1)-release)

Java

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;

/**
 * Reads one UTF-8 line from standard input, prints it with every character
 * replaced by '.', then prints each character of the line on its own line.
 */
public class Main {
    public static void main(String[] args) {
        // Decode and encode explicitly as UTF-8 instead of relying on the platform default charset.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(System.in, java.nio.charset.StandardCharsets.UTF_8));
            PrintWriter out = new PrintWriter(
                new java.io.OutputStreamWriter(System.out, java.nio.charset.StandardCharsets.UTF_8))
        ) {
            String s = in.readLine();
            if (s == null) {
                // EOF before any line was read: nothing to print.
                return;
            }

            // String replacement. (?s) lets '.' also match line-separator characters
            // such as U+2028 that readLine() leaves inside the string.
            out.println(s.replaceAll("(?s).", "."));

            // String splitting: advance by code point so a surrogate pair stays together
            // and a lone trailing surrogate cannot index past the end of the string.
            for (int i = 0; i < s.length(); ) {
                int codePoint = s.codePointAt(i);
                out.println(new String(Character.toChars(codePoint)));
                i += Character.charCount(codePoint);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

C

#include <stdio.h>
#include <string.h>
#include <locale.h>
#include <stdlib.h>
#include <regex.h>

/*
 * Reads one line from stdin, prints it with every character replaced by '.',
 * then prints each character on its own line. Characters are found with the
 * POSIX regex "." under the ja_JP.UTF-8 locale.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    setlocale(LC_ALL, "ja_JP.UTF-8");

    char str[1024];
    if (!fgets(str, sizeof(str), stdin)) {
        /* EOF or read error before any data: treat as an empty line. */
        str[0] = '\0';
    }

    /* Strip trailing CR/LF; the length check keeps this safe for an empty string. */
    size_t len = strlen(str);
    while (len > 0 && (str[len - 1] == '\n' || str[len - 1] == '\r')) {
        str[--len] = '\0';
    }

    regex_t rb;
    int rc = regcomp(&rb, ".", REG_EXTENDED | REG_NEWLINE);
    if (rc != 0) {
        /* regcomp reports failures through its return code, not errno. */
        char msg[256];
        regerror(rc, &rb, msg, sizeof(msg));
        fprintf(stderr, "regcomp: %s\n", msg);
        return 1;
    }

    const char* p;
    regmatch_t rm;

    /* String replacement: copy any unmatched prefix, then emit '.' for the match. */
    p = str;
    while (regexec(&rb, p, 1, &rm, 0) == 0) {
        fwrite(p, 1, (size_t)rm.rm_so, stdout);
        fputc('.', stdout);
        p += rm.rm_eo;
    }
    fprintf(stdout, "%s\n", p);

    /* String splitting: print each match followed by a newline. */
    p = str;
    while (regexec(&rb, p, 1, &rm, 0) == 0) {
        fwrite(p + rm.rm_so, 1, (size_t)(rm.rm_eo - rm.rm_so), stdout);
        fputc('\n', stdout);
        p += rm.rm_eo;
    }

    regfree(&rb);

    return 0;
}

C++

#include <iostream>
#include <locale>
#include <string>

using namespace std;

// Reads one wide-character line from stdin, prints one '.' per character,
// then prints each character on its own line.
int main(int argc, char** argv) {
    setlocale(LC_ALL, "ja_JP.UTF-8");
    wcout.imbue(locale("japanese"));

    wstring str;
    getline(wcin, str);

    // String replacement (workaround).
    //   Author's note: regex_match only handles whole-string matches, so it is not used here.
    wcout << wstring(str.length(), L'.') << endl;

    // String splitting (workaround).
    //   Author's note: regex_match only handles whole-string matches, so it is not used here.
    for (const wchar_t ch : str) {
        wcout << ch << endl;
    }

    return EXIT_SUCCESS;
}

(2017/12/05追記)

「yum install boost-devel」してBoostのライブラリを使うようにしたらまともに動いてくれたので、そのソースコードを追記。コンパイル時に「-lboost_regex」が必要。

#include <iostream>
#include <locale>
#include <string>
#include <boost/regex.hpp>

using namespace std;

int main(int argc, char** argv) {
    setlocale(LC_ALL, "ja_JP.UTF-8");
    wcout.imbue(locale("japanese"));

    wstring str;
    getline(wcin, str);

    boost::wregex re(L".");

    // 文字列置換
    wcout << boost::regex_replace(str, re, L".") << endl;

    // 文字列分割
    boost::wsmatch sm;
    wstring::const_iterator start = str.begin();
    wstring::const_iterator end = str.end();
    int offset = 0;
    while (boost::regex_search(start + offset, end, sm, re)) {
        size_t idx = 0;
        for (int i = 0; i < sm.length(idx); ++i) {
            wcout << str[sm.position(idx) + offset + i];
        }
        wcout << endl;
        offset += sm.position(idx) + sm.length(idx);
    }

    return EXIT_SUCCESS;
}

PHP

<?php

// Read exactly one line and strip only the line terminator, so that leading
// or trailing spaces (which are part of the input) are preserved.
$str = fgets(STDIN);
if ($str === false) {
    $str = '';
}
$str = rtrim($str, "\r\n");

mb_regex_encoding('UTF-8');

// String replacement
//   The mb_* family only offers the ereg flavour;
//   note that the pattern syntax differs from the preg_* functions.
echo mb_ereg_replace('.', '.', $str) . PHP_EOL;

// String splitting
//   Initialise the search once and let mb_ereg_search_regs() walk through the
//   string; each call returns the next match or false when there are no more.
if ($str !== '') {
    mb_ereg_search_init($str, '.');
    while (($regs = mb_ereg_search_regs()) !== false) {
        echo $regs[0] . PHP_EOL;
    }
}

Python 2

# -*- coding: UTF-8 -*-
import sys
import re

s = sys.stdin.readline()
ustr = unicode(s, 'UTF-8')
ustr = ustr.replace('\n', '')
ustr = ustr.replace('\r', '')

# 文字列置換
print re.sub(r'.', '.', ustr)

# 文字列分割
for i in range(0, len(ustr)):
    print ustr[i].encode('UTF-8')

Python 3

# -*- coding: UTF-8 -*-
import sys
import re

raw = sys.stdin.buffer.readline()
# Decode the raw bytes as UTF-8, then drop every LF and CR.
s = raw.decode('UTF-8').replace('\n', '').replace('\r', '')

# String replacement
print(re.sub(r'.', '.', s))

# String splitting: iterate over the characters directly.
for ch in s:
    print(ch)

Ruby

str = STDIN.gets
str.chomp!

# String replacement
dots = str.gsub(/./, '.')
print dots, "\n"

# String splitting: visit each character in order.
str.each_char do |ch|
    print ch, "\n"
end

Perl

use Encode;

use strict;
use warnings;

my $str = readline(STDIN);
# readline returns undef at EOF; treat that as an empty line.
$str = '' unless defined $str;
chomp($str);

# Decode the UTF-8 bytes into a character string once; both steps use it.
my $ustr = decode('UTF-8', $str);

# String replacement
(my $dots = $ustr) =~ s/././g;
print $dots, "\n";

# String splitting: split on the empty pattern to get one character per element.
for my $ch (split //, $ustr) {
    print encode('UTF-8', $ch), "\n";
}

Go

package main

import (
    "fmt"
    "os"
    "io"
    "bufio"
    "regexp"
)

func ReadLine(reader *bufio.Reader) (s string, err error) {
    prefix := false
    buf := make([]byte, 0)
    var line []byte
    for {
        line, prefix, err = reader.ReadLine()
        if err == io.EOF {
            return
        }
        buf = append(buf, line...)
        if prefix {
            continue
        }
        s = string(buf)
        return
    }
}

// main reads one line from stdin, prints it with every character replaced
// by '.', then prints each rune of the line on its own line.
func main() {
    reader := bufio.NewReader(os.Stdin)
    line, _ := ReadLine(reader)

    anyChar := regexp.MustCompile(`.`)
    fmt.Println(anyChar.ReplaceAllString(line, "."))

    // Ranging over a string yields its runes one at a time.
    for _, r := range line {
        fmt.Println(string(r))
    }
}

bash

#! /bin/bash

# -r keeps backslashes literal; the empty IFS keeps leading/trailing whitespace.
IFS= read -r s

# printf is used instead of echo so that input such as "-n" or "\t" is printed verbatim.
printf '%s\n' "${s}" | sed -e 's/././g'

printf '%s' "${s}" | sed -e 's/\(.\)/\1\n/g'

2017-12-04

各言語でUTF-8バイト列からバイト数と文字数を取ってみる

Java C C++ PHP Python Ruby Perl Go bash

各言語でUTF-8のバイト列を読み込み、バイト数とUnicodeでの文字数を取得してみたメモ。

要件は以下の通り。

標準入力から、文字列が1行だけ入力される。
- 文字エンコーディングはUTF-8
- 入力文字数は高々10文字とする
標準出力に、以下の3つを改行区切りで出力する。
- 文字列の総バイト数
- 長さ
- 入力文字列そのもの

環境

手元にあるものということで、環境は以下のものに限定する。

CentOS 7
- Java (openjdk version "1.8.0_151")
- C (gcc (GCC) 4.8.5)
  - -std=gnu11でコンパイル
- C++ (g++ (GCC) 4.8.5)
  - -std=gnu++1yでコンパイル
- PHP (PHP 5.4.16 (cli))
- Python 2 (Python 2.7.5)
- Python 3 (Python 3.6.3)
  - ソースからビルドしたもの
- Ruby (ruby 2.0.0p648)
- Perl (v5.16.3)
- Go (go version go1.8.3 linux/amd64)
- bash (4.2.46(1)-release)

Java

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;

/**
 * Reads one UTF-8 line from standard input and prints its UTF-8 byte count,
 * its length in code points, and the line itself.
 */
public class Main {
    public static void main(String[] args) {
        // Decode and encode explicitly as UTF-8 instead of relying on the platform default charset.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(System.in, java.nio.charset.StandardCharsets.UTF_8));
            PrintWriter out = new PrintWriter(
                new java.io.OutputStreamWriter(System.out, java.nio.charset.StandardCharsets.UTF_8))
        ) {
            String s = in.readLine();
            if (s == null) {
                // EOF before any line was read: treat as an empty line.
                s = "";
            }
            out.println(s.getBytes(java.nio.charset.StandardCharsets.UTF_8).length);
            // codePointCount counts a surrogate pair as one character; length() would count two.
            out.println(s.codePointCount(0, s.length()));
            out.println(s);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Javaは、歴史的経緯からcharが16ビット(UTF-16のコード単位)なので、サロゲートペアで表される文字は1つのcharに収まらない。そのため、文字数を知りたいときにString.length()を呼んではダメで、codePointCount()を使う必要がある。

C

#include <stdio.h>
#include <string.h>
#include <locale.h>
#include <wchar.h>
#include <stdlib.h>

/*
 * Reads one line from stdin and prints its byte count, its character count
 * under the ja_JP.UTF-8 locale, and the line itself.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    setlocale(LC_ALL, "ja_JP.UTF-8");

    char str[1024];
    if (!fgets(str, sizeof(str), stdin)) {
        /* EOF or read error before any data: treat as an empty line. */
        str[0] = '\0';
    }

    /* Strip trailing CR/LF; the length check keeps this safe for an empty string. */
    size_t byteLen = strlen(str);
    while (byteLen > 0 && (str[byteLen - 1] == '\n' || str[byteLen - 1] == '\r')) {
        str[--byteLen] = '\0';
    }

    /* size_t must be printed with %zu, not %d. */
    fprintf(stdout, "%zu\n", byteLen);

    /* With a NULL destination mbsrtowcs only counts the wide characters,
       so no fixed-size wchar_t buffer is needed. */
    const char* p = str;
    mbstate_t state;
    memset(&state, 0, sizeof(state));
    size_t charLen = mbsrtowcs(NULL, &p, 0, &state);
    if (charLen == (size_t)-1) {
        /* Invalid multibyte sequence for the current locale. */
        perror("mbsrtowcs");
        return 1;
    }
    fprintf(stdout, "%zu\n", charLen);

    fprintf(stdout, "%s\n", str);

    return 0;
}

C++

#include <iostream>
#include <cwchar>
#include <clocale>
#include <string>
#include <cstring>

using namespace std;

int main(int argc, char** argv) {
    setlocale(LC_ALL, "ja_JP.UTF-8");

    wstring str;
    getline(wcin, str);

    char cbuf[1024];
    wcstombs(cbuf, str.c_str(), sizeof(cbuf));
    wcout << strlen(cbuf) << endl;

    wcout << str.length() << endl;

    wcout << str << endl;

    return EXIT_SUCCESS;
}

PHP

<?php

// Read exactly one line and strip only the line terminator; trim() would also
// remove leading/trailing spaces and change both counts.
$str = fgets(STDIN);
if ($str === false) {
    $str = '';
}
$str = rtrim($str, "\r\n");

// Byte count
echo strlen($str) . PHP_EOL;

// Character count
echo mb_strlen($str, 'UTF-8') . PHP_EOL;

echo $str . PHP_EOL;

Python 2

import sys

s = sys.stdin.readline()
s = s.replace('\n', '')
s = s.replace('\r', '')

print len(s)

ustr = unicode(s, 'UTF-8')
print len(ustr)

print s

Python 3

import sys

# Read one line as bytes, decode as UTF-8 and drop every LF and CR.
s = sys.stdin.buffer.readline().decode('UTF-8').replace('\n', '').replace('\r', '')

# Byte count of the cleaned text re-encoded as UTF-8
print(len(s.encode('UTF-8')))

# Character count
print(len(s))

print(s)

Ruby

str = STDIN.gets
str.chomp!

# Byte count
print str.bytesize, "\n"

# Character count
print str.length, "\n"

print str, "\n"

Perl

use Encode;

use strict;
use warnings;

my $str = readline(STDIN);
# readline returns undef at EOF; treat that as an empty line.
$str = '' unless defined $str;
chomp($str);

# $str still holds raw bytes here, so length() is the byte count.
print length($str),"\n";

# Decode into characters to get the character count.
# (Named $chars rather than $b, which would mask the special sort variable.)
my $chars = decode('UTF-8', $str);
print length($chars),"\n";

print $str,"\n";

Go

package main

import (
    "fmt"
    "os"
    "io"
    "bufio"
)

func ReadLine(reader *bufio.Reader) (s string, err error) {
    prefix := false
    buf := make([]byte, 0)
    var line []byte
    for {
        line, prefix, err = reader.ReadLine()
        if err == io.EOF {
            return
        }
        buf = append(buf, line...)
        if prefix {
            continue
        }
        s = string(buf)
        return
    }
}

// main reads one line from stdin and prints its byte count, its rune count,
// and the line itself.
func main() {
    reader := bufio.NewReader(os.Stdin)
    line, _ := ReadLine(reader)

    // len on a string is the number of bytes.
    fmt.Println(len(line))

    // Ranging over a string advances one rune per iteration.
    runeCount := 0
    for range line {
        runeCount++
    }
    fmt.Println(runeCount)

    fmt.Println(line)
}

bash

#! /bin/bash

# -r keeps backslashes literal; the empty IFS keeps leading/trailing whitespace.
IFS= read -r s

# Byte count. printf is used instead of echo so that the input is passed verbatim.
printf '%s' "${s}" | wc -c

# Character count
echo "${#s}"

printf '%s\n' "${s}"

まさか、サロゲートペアを含む文字列の文字数をシェルスクリプトでちゃんと取れるとは思ってなかった。

2017-12-03

各言語でファイル入出力＋文字エンコーディング変換

Java C C++ PHP Python Ruby Perl Go bash

各言語でファイル入出力と文字エンコーディング変換を書いてみたメモ。

やってる途中で、別々のエントリに分けた方が良かったかもと思ったりもしたが、例えばJavaなんかは内部的には「文字」はUTF-16だったりして入出力と文字エンコーディング変換が深くかかわっていたりするので、まぁいいかということで。

要件は以下の通り。

コマンドライン引数として、ディレクトリのパスを渡す
- ディレクトリ内に「in.txt」という、EUC-JPなファイルが置いてある
「in.txt」を読み込み、文字エンコーディングをShift_JISに変換して、「out.txt」として同じディレクトリに書き込む

環境

手元にあるものということで、環境は以下のものに限定する。

CentOS 7
- Java (openjdk version "1.8.0_151")
- C (gcc (GCC) 4.8.5)
  - -std=gnu11でコンパイル
- C++ (g++ (GCC) 4.8.5)
  - -std=gnu++1yでコンパイル
- PHP (PHP 5.4.16 (cli))
- Python 2 (Python 2.7.5)
- Python 3 (Python 3.6.3)
  - ソースからビルドしたもの
- Ruby (ruby 2.0.0p648)
- Perl (v5.16.3)
- Go (go version go1.8.3 linux/amd64)
- bash (4.2.46(1)-release)

Java

java.nio.charsetパッケージのCharsetDecoder / CharsetEncoderの存在を今更知ったので、2パターン書いてみた。

パターン1
- ファイル読み込み時にInputStreamReaderで文字エンコーディング変換
- ファイル書き込み時にOutputStreamWriterで文字エンコーディング変換
パターン2
- 読み込んだバイト列をCharsetDecoderで文字列に変換
- 文字列をCharsetEncoderでバイト列に変換して書き込み

パターン1

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

/**
 * Pattern 1: converts {@code in.txt} (EUC-JP) in the given directory to
 * {@code out.txt} (Windows-31J) by letting the reader and writer do the
 * character-encoding conversion.
 */
public class Main {
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: java Main dirname");
            System.exit(1);
            return;
        }

        final String dir = args[0];
        final File source = new File(dir, "in.txt");
        final File target = new File(dir, "out.txt");

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(source), "EUC-JP"));
            PrintWriter writer = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(target), "Windows-31J"))
        ) {
            // Copy decoded characters chunk by chunk until read() stops returning data.
            final char[] chunk = new char[1024];
            for (int n = reader.read(chunk); n > 0; n = reader.read(chunk)) {
                writer.write(chunk, 0, n);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

パターン2

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/**
 * Pattern 2: converts {@code in.txt} (EUC-JP) in the given directory to
 * {@code out.txt} (Windows-31J) by decoding the raw bytes with a
 * CharsetDecoder and re-encoding the characters with a CharsetEncoder.
 */
public class Main {
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: java Main dirname");
            System.exit(1);
            return;
        }

        File inFile = new File(args[0], "in.txt");
        File outFile = new File(args[0], "out.txt");

        CharsetDecoder decoder = Charset.forName("EUC-JP").newDecoder();
        decoder.reset();
        CharsetEncoder encoder = Charset.forName("Windows-31J").newEncoder();
        encoder.reset();

        try (InputStream in = new BufferedInputStream(new FileInputStream(inFile));
            PrintStream out = new PrintStream(new FileOutputStream(outFile))
        ) {
            // inBuf: undecoded input bytes; tmpBuf: decoded chars; outBuf: encoded output bytes.
            ByteBuffer inBuf = ByteBuffer.allocate(1024);
            ByteBuffer outBuf = ByteBuffer.allocate(1024);
            CharBuffer tmpBuf = CharBuffer.allocate(1024);
            byte[] buf = new byte[1024];
            int len;
            // Read no more than inBuf can still hold, since bytes left over from the
            // previous round (kept by compact()) are still in it.
            while ((len = in.read(buf, 0, Math.min(buf.length, inBuf.remaining()))) > 0) {
                inBuf.put(buf, 0, len);
                inBuf.flip();
                tmpBuf.clear();
                CoderResult res = decoder.decode(inBuf, tmpBuf, false);
                // NOTE(review): only the underflow result is handled. For any other result
                // nothing is written and inBuf is left flipped rather than compacted — confirm
                // whether malformed input should be reported here.
                if (res.isUnderflow()) {
                    // Keep any undecoded trailing bytes at the front of inBuf for the next round.
                    inBuf.compact();
                    tmpBuf.flip();
                    outBuf.clear();
                    // NOTE(review): the CoderResult of encode() is ignored — confirm that
                    // overflow/unmappable results cannot occur for the expected input.
                    encoder.encode(tmpBuf, outBuf, false);
                    outBuf.flip();
                    outBuf.get(buf, 0, outBuf.limit());
                    out.write(buf, 0, outBuf.limit());
                }
            }
            /* Dummy pass with endOfInput=true and empty input, done so that flush() can be called */
            // NOTE(review): clear() followed by flip() leaves inBuf empty, so any bytes that
            // compact() kept from the last round are dropped here — confirm this is intended.
            inBuf.clear();
            inBuf.flip();
            tmpBuf.clear();
            decoder.decode(inBuf, tmpBuf, true);
            tmpBuf.flip();
            outBuf.clear();
            encoder.encode(tmpBuf, outBuf, true);
            outBuf.flip();
            outBuf.get(buf, 0, outBuf.limit());
            out.write(buf, 0, outBuf.limit());

            /* flush() */
            // NOTE(review): the chars written to tmpBuf by decoder.flush() are never passed
            // to the encoder; only encoder.flush() output is written — confirm.
            tmpBuf.clear();
            decoder.flush(tmpBuf);
            tmpBuf.flip();
            outBuf.clear();
            encoder.flush(outBuf);
            outBuf.flip();
            outBuf.get(buf, 0, outBuf.limit());
            out.write(buf, 0, outBuf.limit());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

なんか、CharsetDecoder / CharsetEncoderの挙動を理解するのにすごく時間が掛かった、というか、そもそもByteBuffer / CharBufferの挙動もよく分からんかった。今でも、上記プログラムでほんとに正しいのか全く自信が無い‥

C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <limits.h>
#include <errno.h>

#define BUF_SIZE 1024

/*
 * Converts <dirname>/in.txt from EUC-JP to CP932 and writes the result to
 * <dirname>/out.txt using iconv. Returns 0 on success and 1 on any failure.
 */
int main(int argc, char** argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s dirname\n", argv[0]);
        return 1;
    }

    /* Build the file paths; snprintf cannot overflow the buffers. */
    char inFile[PATH_MAX];
    char outFile[PATH_MAX];
    int inLen = snprintf(inFile, sizeof(inFile), "%s/in.txt", argv[1]);
    int outLen = snprintf(outFile, sizeof(outFile), "%s/out.txt", argv[1]);
    if (inLen < 0 || (size_t)inLen >= sizeof(inFile) ||
        outLen < 0 || (size_t)outLen >= sizeof(outFile)) {
        fprintf(stderr, "%s: path too long\n", argv[1]);
        return 1;
    }

    /* Open the files */
    FILE* inFp = fopen(inFile, "rb");
    if (!inFp) {
        perror(inFile);
        return 1;
    }
    FILE* outFp = fopen(outFile, "wb");
    if (!outFp) {
        perror(outFile);
        fclose(inFp);
        return 1;
    }

    /* Prepare the character-encoding conversion */
    iconv_t iconvHandler = iconv_open("CP932", "EUC-JP");
    if (iconvHandler == (iconv_t)-1) {
        perror("iconv_open");
        fclose(inFp);
        fclose(outFp);
        return 1;
    }

    /* Read input, convert it, and write the output */
    char inBuf[BUF_SIZE];
    size_t inBufLeft = 0;   /* bytes in inBuf that are not yet converted */
    char outBuf[BUF_SIZE];
    int status = 0;

    while (status == 0) {
        size_t len = fread(inBuf + inBufLeft, sizeof(char), sizeof(inBuf) - inBufLeft, inFp);
        int atEof = (len == 0);
        inBufLeft += len;
        if (inBufLeft == 0) {
            break;  /* nothing read and nothing pending */
        }

        char* inPtr = inBuf;
        for (;;) {
            char* outPtr = outBuf;
            size_t outBufLeft = sizeof(outBuf);

            size_t rc = iconv(iconvHandler, &inPtr, &inBufLeft, &outPtr, &outBufLeft);
            int err = errno;  /* save before fwrite can change it */

            size_t produced = sizeof(outBuf) - outBufLeft;
            if (fwrite(outBuf, sizeof(char), produced, outFp) != produced) {
                perror(outFile);
                status = 1;
                break;
            }
            if (rc != (size_t)-1) {
                break;      /* all pending input was converted */
            }
            if (err == E2BIG) {
                continue;   /* output buffer was full; it is flushed now, so go on */
            }
            if (err == EINVAL && !atEof) {
                break;      /* incomplete sequence at buffer end; read more input */
            }
            /* EILSEQ, or an incomplete sequence at end of file (which would
               otherwise never be consumed and loop forever). */
            errno = err;
            perror("iconv");
            status = 1;
            break;
        }

        /* Move the unconverted tail to the front. The regions may overlap and
           may contain any byte value, so memmove is required, not strncpy. */
        if (status == 0 && inBufLeft > 0) {
            memmove(inBuf, inPtr, inBufLeft);
        }
    }
    if (ferror(inFp)) {
        perror(inFile);
        status = 1;
    }

    /* Emit any pending shift sequence (a no-op for stateless encodings). */
    if (status == 0) {
        char* outPtr = outBuf;
        size_t outBufLeft = sizeof(outBuf);
        if (iconv(iconvHandler, NULL, NULL, &outPtr, &outBufLeft) != (size_t)-1) {
            fwrite(outBuf, sizeof(char), sizeof(outBuf) - outBufLeft, outFp);
        }
    }
    iconv_close(iconvHandler);

    /* Close the files */
    fclose(inFp);
    if (fclose(outFp) != 0) {
        perror(outFile);
        status = 1;
    }

    return status;
}

iconvの挙動を理解するのにすごい苦労した。というか、ネット上に転がっていたサンプルをいくつか試してみたが、入力データにちょっと細工したりするとすぐにテストケースでNGが出たりして使えないということになり、結局manページを熟読して理解したという・・・

Man page of ICONV

C++

C言語で書いたプログラムのうち、ファイル入出力の部分をC++版にしてみたもの。

#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <iconv.h>
#include <limits.h>
#include <errno.h>

#define BUF_SIZE 1024

using namespace std;

int main(int argc, char** argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s dirname\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* ファイルのオープン */
    char inFile[PATH_MAX];
    sprintf(inFile, "%s/in.txt", argv[1]);
    char outFile[PATH_MAX];
    sprintf(outFile, "%s/out.txt", argv[1]);

    ifstream inFs(inFile, ios::binary);
    if (inFs.fail()) {
        perror(inFile);
        return EXIT_FAILURE;
    }
    ofstream outFs(outFile, ios::binary);
    if (outFs.fail()) {
        perror(outFile);    
        return EXIT_FAILURE;
    }

    /* 文字エンコーディング変換の準備 */
    iconv_t iconvHandler = iconv_open("CP932", "EUC-JP");

    /* 入力を読み込んで文字エンコーディング変換して出力 */
    char inBuf[BUF_SIZE / 2];
    size_t inBufLeft = 0;
    char outBuf[BUF_SIZE * 2];
    streamsize len;
    while ((len = inFs.readsome(inBuf + inBufLeft, sizeof(inBuf) - inBufLeft)) + inBufLeft > 0) {
        inBufLeft += len;
        char* inPtr = inBuf;
        char* outPtr = outBuf;
        size_t outBufLeft = sizeof(outBuf);

        int rc = iconv(iconvHandler, &inPtr, &inBufLeft, &outPtr, &outBufLeft);
        if (rc == -1 && (errno == EILSEQ || errno == E2BIG)) {
            perror("iconv");
            break;
        }
        outFs.write(outBuf, sizeof(outBuf) - outBufLeft);

        if (inBufLeft > 0) {
            strncpy(inBuf, inPtr, inBufLeft);
        }
    }
    iconv_close(iconvHandler);

    return EXIT_SUCCESS;
}

PHP

2パターン思いついたので、それぞれ書いてみた。

パターン1
- ファイルI/Oは、fopen / fgets / fputs / fclose
- 文字エンコーディング変換は、mb_convert_encoding
パターン2
- ファイルI/Oは、file_get_contents / file_put_contents
- 文字エンコーディング変換は、iconv

パターン1

<?php

// Pattern 1: convert in.txt (EUC-JP) to out.txt (Shift_JIS) line by line
// with fopen / fgets / fputs and mb_convert_encoding.
if (count($argv) < 2) {
    file_put_contents('php://stderr', "Usage: php {$argv[0]} dirname" . PHP_EOL);
    exit(1);
}
$dir = $argv[1];
if (!file_exists($dir) || !is_dir($dir)) {
    file_put_contents('php://stderr', "{$dir}: No such directory." . PHP_EOL);
    exit(1);
}

$inFile = "{$dir}/in.txt";
$outFile = "{$dir}/out.txt";

$inFp = fopen($inFile, 'rb');
if (!$inFp) {
    file_put_contents('php://stderr', "{$inFile}: Cannot open file." . PHP_EOL);
    exit(1);
}
$outFp = fopen($outFile, 'wb');
if (!$outFp) {
    file_put_contents('php://stderr', "{$outFile}: Cannot open file." . PHP_EOL);
    fclose($inFp);
    exit(1);
}

// Compare against false explicitly: a final line consisting of just "0"
// is falsy and would otherwise end the loop early and be dropped.
while (($line = fgets($inFp)) !== false) {
    $line = mb_convert_encoding($line, 'SJIS-win', 'eucJP-win');
    fputs($outFp, $line);
}

fclose($inFp);
fclose($outFp);

パターン2

<?php

// Pattern 2: convert in.txt (EUC-JP) to out.txt (Shift_JIS) in one go
// with file_get_contents / file_put_contents and iconv.
if (count($argv) < 2) {
    file_put_contents('php://stderr', "Usage: php {$argv[0]} dirname" . PHP_EOL);
    exit(1);
}

$dir = $argv[1];
if (!is_dir($dir)) {
    file_put_contents('php://stderr', "{$dir}: No such directory." . PHP_EOL);
    exit(1);
}

$inFile = "{$dir}/in.txt";
$outFile = "{$dir}/out.txt";
if (!file_exists($inFile)) {
    file_put_contents('php://stderr', "{$inFile}: No such file." . PHP_EOL);
    exit(1);
}

$str = file_get_contents($inFile);
if ($str === false) {
    file_put_contents('php://stderr', "{$inFile}: Cannot read file." . PHP_EOL);
    exit(1);
}

// iconv() returns false on failure; writing that result would silently
// produce an empty out.txt.
$converted = iconv('eucJP-win', 'SJIS-win', $str);
if ($converted === false) {
    file_put_contents('php://stderr', "{$inFile}: Cannot convert encoding." . PHP_EOL);
    exit(1);
}

if (file_put_contents($outFile, $converted) === false) {
    file_put_contents('php://stderr', "{$outFile}: Cannot write file." . PHP_EOL);
    exit(1);
}

Python 2

import sys
import os
import codecs

# Convert <dirname>/in.txt (EUC-JP) to <dirname>/out.txt (CP932).
if len(sys.argv) < 2:
    sys.stderr.write("Usage: " + sys.argv[0] + " dirname\n")
    # sys.exit is used because the bare exit() helper is only provided by the site module.
    sys.exit(1)

dirname = sys.argv[1]

inFile = dirname + "/in.txt"
outFile = dirname + "/out.txt"
if not os.path.isfile(inFile):
    sys.stderr.write(inFile + ": No such file\n")
    sys.exit(1)

# The with statement closes both files even if decoding or encoding raises.
with codecs.open(inFile, 'rb', 'EUC-JP') as inFp, \
        codecs.open(outFile, 'wb', 'CP932') as outFp:
    for line in inFp:
        outFp.write(line)

Python 3

import sys
import os
import codecs

# Convert <dirname>/in.txt (EUC-JP) to <dirname>/out.txt (CP932).
if len(sys.argv) < 2:
    sys.stderr.write("Usage: " + sys.argv[0] + " dirname\n")
    # sys.exit is used because the bare exit() helper is only provided by the site module.
    sys.exit(1)

dirname = sys.argv[1]

inFile = dirname + "/in.txt"
outFile = dirname + "/out.txt"
if not os.path.isfile(inFile):
    sys.stderr.write(inFile + ": No such file\n")
    sys.exit(1)

# The with statement closes both files even if decoding or encoding raises.
with codecs.open(inFile, 'rb', 'EUC-JP') as inFp, \
        codecs.open(outFile, 'wb', 'CP932') as outFp:
    for line in inFp:
        outFp.write(line)

Python 2と何ら変わりはない。

Ruby

# Convert <dirname>/in.txt (EUC-JP) to <dirname>/out.txt (CP932).
if ARGV.length != 1
    STDERR.puts('Usage: ' + __FILE__ + ' dirname')
    exit 1
end

dir = ARGV[0]

inFile = dir + '/in.txt'
outFile = dir + '/out.txt'
if !File.file?(inFile)
    STDERR.puts(inFile + ': No such file')
    exit 1
end

# The block form closes both files even if the conversion raises.
File.open(inFile, 'rb') do |inFp|
    File.open(outFile, 'wb') do |outFp|
        inFp.each_line do |line|
            # write (not puts) so that no newline is appended when the last
            # input line has none.
            outFp.write(line.encode('CP932', 'EUC-JP'))
        end
    end
end

Perl

パターン1
- ファイルオープン時に文字エンコーディングを指定
パターン2
- 読み込んだ文字列をencode / decodeで文字エンコーディング変換

パターン1

use strict;
use warnings;

# Pattern 1: let the I/O layers do the conversion by naming the encodings
# when the files are opened.
if (@ARGV < 1) {
    die("Usage: " . __FILE__ . " dirname");
}

my $dir = $ARGV[0];
my $inFile = $dir . "/in.txt";
my $outFile = $dir . "/out.txt";

# Lexical filehandles instead of bareword (package-global) ones.
open(my $inFp, "<:encoding(EUC-JP)", $inFile) or die($inFile . ": $!");
open(my $outFp, ">:encoding(CP932)", $outFile) or die($outFile . ": $!");

while (my $line = <$inFp>) {
    print {$outFp} $line;
}

close($inFp);
# Buffered write errors surface at close, so check it.
close($outFp) or die($outFile . ": $!");

パターン2

use Encode;

use strict;
use warnings;

# Pattern 2: read raw bytes and convert each line with decode / encode.
if (@ARGV < 1) {
    die("Usage: " . __FILE__ . " dirname");
}

my $dir = $ARGV[0];
my $inFile = $dir . "/in.txt";
my $outFile = $dir . "/out.txt";

# Lexical filehandles instead of bareword (package-global) ones.
open(my $inFp, "<", $inFile) or die($inFile . ": $!");
open(my $outFp, ">", $outFile) or die($outFile . ": $!");

while (my $line = <$inFp>) {
    $line = encode('CP932', decode('EUC-JP', $line));
    print {$outFp} $line;
}

close($inFp);
# Buffered write errors surface at close, so check it.
close($outFp) or die($outFile . ": $!");

Go

package main

import (
    "fmt"
    "os"
    "io"
    "bufio"

    "golang.org/x/text/encoding/japanese"
    "golang.org/x/text/transform"
)

// main converts <dirname>/in.txt (EUC-JP) to <dirname>/out.txt (Shift_JIS).
func main() {
    if len(os.Args) < 2 {
        fmt.Fprintln(os.Stderr, "Usage: " + os.Args[0] + " dirname")
        os.Exit(1)
    }

    if err := convert(os.Args[1]); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
}

// convert copies dir/in.txt to dir/out.txt, decoding EUC-JP on the way in and
// encoding Shift_JIS on the way out. Every write, flush and close error is
// reported to the caller instead of being ignored.
func convert(dir string) (err error) {
    inFp, err := os.Open(dir + "/in.txt")
    if err != nil {
        return err
    }
    defer inFp.Close()

    outFp, err := os.Create(dir + "/out.txt")
    if err != nil {
        return err
    }
    defer func() {
        // A failed close of the output file means data may be missing.
        if closeErr := outFp.Close(); err == nil {
            err = closeErr
        }
    }()

    in := bufio.NewReader(transform.NewReader(inFp, japanese.EUCJP.NewDecoder()))
    encoder := transform.NewWriter(outFp, japanese.ShiftJIS.NewEncoder())
    out := bufio.NewWriter(encoder)

    if _, err = io.Copy(out, in); err != nil {
        return err
    }
    if err = out.Flush(); err != nil {
        return err
    }
    // Closing the transform writer flushes whatever the transformer still holds.
    return encoder.Close()
}

これをやるには「GOPATH=$(pwd) go get golang.org/x/text/encoding/japanese」しておく必要がある。更に実行時に「GOPATH=$(pwd) go run Main.go」することも忘れずに。

Golang による文字エンコーディング変換 - Qiita

bash

#! /bin/bash

# Convert <dirname>/in.txt (EUC-JP) to <dirname>/out.txt (CP932).
dir="${1}"
# Messages go to the already-open fd 2 (>&2) rather than reopening /dev/stderr by path.
if [ -z "${dir}" ]; then
    echo "Usage: $0 dirname" >&2
    exit 1
fi
if [ ! -d "${dir}" ]; then
    echo "${dir}: No such directory." >&2
    exit 1
fi

iconv -f EUC-JP -t CP932 < "${dir}/in.txt" > "${dir}/out.txt"

2017-12-02

各言語の標準エラー出力

Java C C++ PHP Python Ruby Perl Go bash Awk

そういえば今まで意識しなかったな、ということで、各言語の標準エラー出力を使ってみたメモ。

環境

手元にあるものということで、環境は以下のものに限定する。

CentOS 7
- Java (openjdk version "1.8.0_151")
- C (gcc (GCC) 4.8.5)
  - -std=gnu11でコンパイル
- C++ (g++ (GCC) 4.8.5)
  - -std=gnu++1yでコンパイル
- PHP (PHP 5.4.16 (cli))
- Python 2 (Python 2.7.5)
- Python 3 (Python 3.6.3)
  - ソースからビルドしたもの
- Ruby (ruby 2.0.0p648)
- Perl (v5.16.3)
- Go (go version go1.8.3 linux/amd64)
- bash (4.2.46(1)-release)
- Awk (GNU Awk 4.0.2)

Java

import java.io.IOException;

/** Copies everything read from standard input to standard error. */
public class Main {
    public static void main(String[] args) throws IOException {
        final byte[] chunk = new byte[1024];
        // Keep copying while read() still returns data.
        for (int n = System.in.read(chunk); n > 0; n = System.in.read(chunk)) {
            System.err.write(chunk, 0, n);
        }
    }
}

System.inが標準入力、System.outが標準出力、System.errが標準エラー出力を表す。

ちなみに、System.setIn(InputStream)、System.setOut(PrintStream)、System.setErr(PrintStream)というのがあるので、例えば標準出力や標準エラー出力をファイルに書き出すように切り替えるとかも可能。

C

#include <stdio.h>

/* Copies standard input to standard error, one fgets chunk at a time. */
int main(int argc, char** argv) {
    char line[1024];
    for (;;) {
        if (fgets(line, sizeof(line), stdin) == NULL) {
            break;  /* EOF or read error */
        }
        fputs(line, stderr);
    }
    return 0;
}

FILE*型だと、stdinが標準入力、stdoutが標準出力、stderrが標準エラー出力。

ちなみに、int型のファイルディスクリプタ(fd)だと、0が標準入力、1が標準出力、2が標準エラー出力に関連付いている。 unistd.hを見ると、以下のように書いてある。

/* Standard file descriptors.  */
#define STDIN_FILENO    0       /* Standard input.  */
#define STDOUT_FILENO   1       /* Standard output.  */
#define STDERR_FILENO   2       /* Standard error output.  */

C++

#include <iostream>

using namespace std;

// Copies standard input to standard error.
int main(int argc, char** argv) {
    // Stream the whole input buffer across. Reading with getline() into a
    // fixed char array sets failbit on any line longer than the array, which
    // ended the loop and dropped the rest of the input; it also appended a
    // newline to an unterminated last line.
    cerr << cin.rdbuf();
    return EXIT_SUCCESS;
}

std::cinが標準入力、std::coutが標準出力、std::cerrとstd::clogが標準エラー出力にそれぞれ関連付いている。 std::cerrとstd::clogの違いは、前者がバッファリングされないのに対して、後者がバッファリングされるというもの。 iostreamを見ると、以下のように書いてある。

namespace std _GLIBCXX_VISIBILITY(default)
{
  // (中略)
  extern istream cin;           /// Linked to standard input
  extern ostream cout;          /// Linked to standard output
  extern ostream cerr;          /// Linked to standard error (unbuffered)
  extern ostream clog;          /// Linked to standard error (buffered)

std::clogの存在は、このヘッダファイルを見て初めて知った。

PHP

<?php

// Read all of standard input and write it to standard error.
fwrite(STDERR, stream_get_contents(STDIN));

PHPはどちらかというとWebアプリ開発に使われる言語ということで、特に意識せずにprintとかすると標準出力に書き出されるので、標準入出力を意識することは少ないかもしれない。

'php://stdin'が標準入力、'php://stdout'が標準出力、'php://stderr'が標準エラー出力を表すので、これをfile_get_contents / file_put_contentsやfopen等にファイル名と同様に指定してやればよい。

Python 2

import sys

# Call readline() repeatedly until it returns '' (end of input),
# echoing each line to standard error.
for line in iter(sys.stdin.readline, ''):
    sys.stderr.write(line)

sys.stdinが標準入力、sys.stdoutが標準出力、sys.stderrが標準エラー出力。

Python 3

import sys

# Call readline() on the binary stream repeatedly until it returns b''
# (end of input), echoing each line to standard error.
for line in iter(sys.stdin.buffer.readline, b''):
    sys.stderr.buffer.write(line)

sys.stdinが標準入力、sys.stdoutが標準出力、sys.stderrが標準エラー出力。

Ruby

# Copy standard input to standard error line by line.
# print (not puts) is used so that no newline is appended when the last
# input line has none.
while line = STDIN.gets
    STDERR.print line
end

STDINが標準入力、STDOUTが標準出力、STDERRが標準エラー出力。

Perl

# Copy standard input to standard error line by line until EOF.
while (defined(my $line = <STDIN>)) {
    print STDERR $line;
}

STDINが標準入力、STDOUTが標準出力、STDERRが標準エラー出力。

Go

package main

import (
    "bufio"
    "io"
    "os"
)

// main copies standard input to standard error.
func main() {
    stdin := bufio.NewReader(os.Stdin)
    stderr := bufio.NewWriter(os.Stderr)
    // io.Copy stops on EOF and on any other read or write error, so a
    // persistent non-EOF read error can no longer loop forever.
    _, copyErr := io.Copy(stderr, stdin)
    flushErr := stderr.Flush()
    if copyErr != nil || flushErr != nil {
        os.Exit(1)
    }
}

os.Stdinが標準入力、os.Stdoutが標準出力、os.Stderrが標準エラー出力。

bash

#! /bin/bash

# Copy standard input (fd 0) to standard error (fd 2).
cat - 0<&0 1>&2

0が標準入力、1が標準出力、2が標準エラー出力を表すので、標準エラー出力に出したい場合は、>&2のように書いてリダイレクトしてやればよい。

/dev/stdin、/dev/stdout、/dev/stderrを使う手もあるので、>> /dev/stderrのようにリダイレクトしてもよい。

Awk

# For every input record, append the whole record to standard error.
{
    print $0 >> "/dev/stderr";
}

"/dev/stdin"が標準入力、"/dev/stdout"が標準出力、"/dev/stderr"が標準エラー出力。

まぁ、標準入力や標準出力を明示することはほとんどないけど。

2017-12-01

DateTimeクラスのdiffメソッドの罠

PHP

突然ですが、以下のコードの出力結果はどうなると思いますか？ちなみに「invert」というのは、結果が負の場合に「1」それ以外の場合に「0」になるプロパティ。

<?php
date_default_timezone_set("Asia/Tokyo");

// $dt1 is the later date, $dt2 the earlier one.
$dt1 = new DateTime("2017-12-01");
$dt2 = new DateTime("2017-11-30");

// The same difference computed through the method and through the function.
// As the text below explains, the result is $dt2 - $dt1, not $dt1 - $dt2.
$diff1 = $dt1->diff($dt2);
$diff2 = date_diff($dt1, $dt2);

// invert is 1 when the interval is negative; here both dump int(1).
var_dump($diff1->invert, $diff2->invert);

$dt1の方が未来なので、直観的にはint(0)になるだろうと思うところだが・・

ドキュメントは以下だけど・・・

PHP: DateTime::diff - Manual

明記されていないが、$dt2 - $dt1の結果になる。すなわち、負になるので、int(1)が出力される。

これは罠過ぎるだろう！！！

なので、DateTimeオブジェクト同士の日時の前後を比較したいだけなら、(PHP 5.2.2以降になるが)比較演算子で比較するようにした方が安全かも。

<?php
date_default_timezone_set("Asia/Tokyo");

// $dt1 is the later date, $dt2 the earlier one.
$dt1 = new DateTime("2017-12-01");
$dt2 = new DateTime("2017-11-30");

// Comparison operators compare the two points in time directly.
var_dump($dt1 == $dt2);  // bool(false)
var_dump($dt1 < $dt2);   // bool(false)
var_dump($dt1 <= $dt2);  // bool(false)
var_dump($dt1 > $dt2);   // bool(true)
var_dump($dt1 >= $dt2);  // bool(true)

2017-11-29

いやそれリークしますからっ！

Java 爆弾コード

以下のようなプログラムを見つけて、思わず机を叩き割ろうかと思いましたよ、えぇ‥

// Anti-example quoted by the author: every loop iteration opens a new reader,
// but only the reader opened last is closed in the finally block, so all
// earlier streams leak.
Reader reader = null;
try {
    for (int i = 0; i < files.length; ++i) {
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(files[i])));
        // Various processing follows (but never a close() of reader)
    }
} finally {
    if (reader != null) {
        try { reader.close(); } catch (Exception e) {}
    }
}

いや、これだと、最後に開いたファイルのストリームしか閉じないんですけど‥

バッチプログラムで、長くても数分で終了するプログラムだからまだ影響が少なかったが、Webアプリでこれをやったら確実にアウトですよ、アウト‥

参考

IBM [DB2 LUW] Java アプリケーションなどで SQL0805N (-805) エラーが返る (IM-10-00V) - United States

2017-11-28

各言語でコマンドライン引数を扱う

Java C C++ PHP Python Ruby Perl Go bash

そういえばやってなかったなということで、各言語でコマンドライン引数を扱うプログラムを書いてみたメモ。

要件は以下の通り。

コマンドライン引数として、3個以上の文字列を与える
標準出力に、以下を順に出力
- 引数として与えられた文字列を改行区切りで出力
- 引数の個数を出力
- 引数の3番目の文字列を出力

環境

手元にあるものということで、環境は以下のものに限定する。

CentOS 7
- Java (openjdk version "1.8.0_151")
- C (gcc (GCC) 4.8.5)
  - -std=gnu11でコンパイル
- C++ (g++ (GCC) 4.8.5)
  - -std=gnu++1yでコンパイル
- PHP (PHP 5.4.16 (cli))
- Python 2 (Python 2.7.5)
- Python 3 (Python 3.6.3)
  - ソースからビルドしたもの
- Ruby (ruby 2.0.0p648)
- Perl (v5.16.3)
- Go (go version go1.8.3 linux/amd64)
- bash (4.2.46(1)-release)

実行例

[各言語でのコマンド実行] a b c d e

出力例

a
b
c
d
e
5
c

Java

/** Prints every command-line argument, the argument count, and the third argument. */
public class Main {
    public static void main(String[] args) {
        final int count = args.length;
        for (int i = 0; i < count; ++i) {
            System.out.println(args[i]);
        }
        System.out.println(count);
        // args has no program-name entry, so the third argument is at index 2.
        System.out.println(args[2]);
    }
}

C

#include <stdio.h>

/* Prints every command-line argument, the argument count, and the third argument. */
int main(int argc, char** argv) {
    /* argv[0] is the program name, so three arguments means argc >= 4.
       Without this check argv[3] may be NULL and printing it is undefined. */
    if (argc < 4) {
        fprintf(stderr, "Usage: %s arg1 arg2 arg3 [args...]\n", argv[0]);
        return 1;
    }

    for (int i = 1; i < argc; ++i) {
        printf("%s\n", argv[i]);
    }
    printf("%d\n", argc - 1);
    printf("%s\n", argv[3]);

    return 0;
}

C++

#include <iostream>

using namespace std;

int main(int argc, char** argv) {
    for (int i = 1; i < argc; ++i) {
        cout << argv[i] << endl;
    }
    cout << (argc - 1) << endl;
    cout << argv[3] << endl;

    return EXIT_SUCCESS;
}

PHP

<?php

// $argv[0] is the script name, so the real arguments start at index 1.
$args = array_slice($argv, 1);
foreach ($args as $arg) {
    echo $arg . PHP_EOL;
}
echo count($args) . PHP_EOL;
echo $argv[3] . PHP_EOL;

Python 2

import sys

for i in range(1, len(sys.argv)):
    print sys.argv[i]
print len(sys.argv) - 1
print sys.argv[3]

Python 3

import sys

# sys.argv[0] is the script name, so the real arguments start at index 1.
for arg in sys.argv[1:]:
    print(arg)
print(len(sys.argv) - 1)
print(sys.argv[3])

Ruby

# ARGV holds only the arguments (no script name), so the third one is ARGV[2].
ARGV.each do |arg|
    print arg, "\n"
end
print ARGV.size, "\n"
print ARGV[2], "\n"

Perl

# @ARGV holds only the arguments (no script name), so the third one is $ARGV[2].
foreach my $arg (@ARGV) {
    print $arg,"\n";
}
# scalar() forces the array into scalar context, which yields its element count.
print scalar(@ARGV),"\n";
print $ARGV[2],"\n";

Go

package main

import (
    "fmt"
    "os"
)

// main prints every command-line argument, the argument count, and the third argument.
func main() {
    // os.Args[0] is the program name, so the real arguments start at index 1.
    args := os.Args[1:]
    for _, arg := range args {
        fmt.Println(arg)
    }
    fmt.Println(len(args))
    fmt.Println(os.Args[3])
}

bash

#! /bin/bash

# Quote every expansion: unquoted, an argument such as "*" or "a  b" would be
# glob-expanded or word-split before being printed. printf is used instead of
# echo so that arguments like "-n" are printed verbatim.
for c in "$@" ; do
    printf '%s\n' "${c}"
done
echo "$#"
printf '%s\n' "$3"

まとめ

それぞれの言語ごとの仕様を表にまとめてみる。

言語	変数名	最初の引数の添え字	個数
Java (*1)	args	0	args.length
C (*1)	argv	1	argc - 1
C++ (*1)	argv	1	argc - 1
PHP	$argv	1	count($argv) - 1
Python 2/3	sys.argv	1	len(sys.argv) - 1
Ruby	ARGV	0	ARGV.length
Perl	@ARGV (要素は$ARGV[n])	0	@ARGV (*3)
Go	os.Args	1	len(os.Args) - 1
bash	$n または ${n} (*2)	1	$#

(*1)変数名はmainメソッド/関数の引数の宣言による。
(*2)「n」は正の整数。
(*3)文脈によっては、「my $argc = @ARGV」などとしてやらないと、個数でなく引数を結合した文字列が得られてしまう。