HHeLiBeXの日記正道編

日々の記憶の記録とメモ‥

各言語で部分文字列を取得してみる

Java C C++ PHP Python Ruby Perl Go bash Awk

各言語で入力された文字列の部分文字列を取得するプログラムを書いてみたメモ。

要件は以下の通り。

標準入力から、1行の文字列が与えられる
- 文字エンコーディングはUTF-8
- サロゲートペアも含まれることがある
- 文字数は3文字以上であることが保証される
入力文字列の部分文字列「[2, 4)」(つまり2～3文字目からなる文字列)を抽出
標準出力に、抽出した文字列を出力

環境

手元にあるものということで、環境は以下のものに限定する。

CentOS 7
- Java (openjdk version "1.8.0_151")
- C (gcc (GCC) 4.8.5)
  - -std=gnu11でコンパイル
- C++ (g++ (GCC) 4.8.5)
  - -std=gnu++1yでコンパイル
- PHP (PHP 5.4.16 (cli))
- Python 2 (Python 2.7.5)
- Python 3 (Python 3.6.3)
  - ソースからビルドしたもの
- Ruby (ruby 2.0.0p648)
- Perl (v5.16.3)
- Go (go version go1.8.3 linux/amd64)
- bash (4.2.46(1)-release)
- Awk (GNU Awk 4.0.2)

Java

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;

/**
 * Reads one UTF-8 line from standard input and prints the substring made of
 * its 2nd and 3rd characters, counting by Unicode code point so that a
 * surrogate pair is treated as a single character.
 */
public class Main {
    /**
     * Code-point-aware variant of {@link String#substring(int, int)}.
     *
     * @param s          the source string
     * @param startIndex index of the first code point to include (0-based, inclusive)
     * @param endIndex   index of the code point to stop at (0-based, exclusive)
     * @return the code points of {@code s} in the range {@code [startIndex, endIndex)}
     * @throws StringIndexOutOfBoundsException if {@code startIndex} is negative,
     *         {@code endIndex} exceeds the number of code points in {@code s},
     *         or {@code startIndex} is greater than {@code endIndex}
     */
    private static String substring(String s, int startIndex, int endIndex) {
        if (startIndex < 0) {
            throw new StringIndexOutOfBoundsException(startIndex);
        }
        int cpCount = s.codePointCount(0, s.length());
        if (cpCount < endIndex) {
            throw new StringIndexOutOfBoundsException(endIndex);
        }
        int subLen = endIndex - startIndex;
        if (subLen < 0) {
            throw new StringIndexOutOfBoundsException(subLen);
        }

        // Let the standard library translate code point offsets into char
        // offsets. Unlike a hand-written "any surrogate starts a pair" scan,
        // this counts an unpaired surrogate as one code point and never reads
        // past the end of the string.
        int from = s.offsetByCodePoints(0, startIndex);
        int to = s.offsetByCodePoints(from, subLen);
        return s.substring(from, to);
    }

    /**
     * Program entry point: reads one line and prints code points [1, 3) of it.
     *
     * @param args command line arguments (unused)
     */
    public static void main(String[] args) {
        // The input is specified to be UTF-8, so do not rely on the platform
        // default charset for either direction.
        try (BufferedReader in = new BufferedReader(
                 new InputStreamReader(System.in, StandardCharsets.UTF_8));
             PrintWriter out = new PrintWriter(
                 new OutputStreamWriter(System.out, StandardCharsets.UTF_8))
        ) {
            String s = in.readLine();
            if (s == null) {
                // readLine() returns null at end of stream; there is nothing to extract.
                System.err.println("no input");
                return;
            }

            out.println(substring(s, 1, 3));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

サロゲートペアを考慮すると、Javaでは2つのchar値でサロゲートペアを表すことになるので、部分文字列を抽出する処理に一番手間がかかった。

C

#include <stdio.h>
#include <string.h>
#include <locale.h>
#include <wchar.h>
#include <stdlib.h>

/*
 * Reads one UTF-8 line from standard input and prints its 2nd and 3rd
 * characters to standard output.
 *
 * Returns 0 on success, EXIT_FAILURE when no line can be read or the line
 * cannot be converted to wide characters.
 */
int main(int argc, char** argv) {
    /* The multibyte-to-wide conversion below follows the LC_CTYPE locale. */
    setlocale(LC_ALL, "ja_JP.UTF-8");

    char str[1024];

    /* On EOF or a read error fgets() does not fill str, so stop here. */
    if (fgets(str, sizeof(str), stdin) == NULL) {
        fputs("no input\n", stderr);
        return EXIT_FAILURE;
    }

    /* Strip trailing CR/LF; the length check avoids indexing str[-1]. */
    size_t len = strlen(str);
    while (len > 0 && (str[len - 1] == '\n' || str[len - 1] == '\r')) {
        str[--len] = '\0';
    }

    wchar_t buf[1024];
    const char* p = str;
    /* The third argument is a count of wide characters, not a byte size. */
    size_t wlen = mbsrtowcs(buf, &p, sizeof(buf) / sizeof(buf[0]), NULL);
    if (wlen == (size_t)-1) {
        fputs("invalid multibyte sequence in input\n", stderr);
        return EXIT_FAILURE;
    }

    wchar_t wstr[3];
    memset(wstr, 0, sizeof(wstr));
    /* buf[1] only holds a converted character when at least two were produced. */
    if (wlen > 1) {
        /* The "2" here is a length, not an index. */
        wcsncpy(wstr, &buf[1], 2);
    }
    fwprintf(stdout, L"%ls\n", wstr);

    return 0;
}

C++

#include <iostream>
#include <locale>
#include <string>
#include <boost/regex.hpp>

using namespace std;

// Reads one line from standard input as wide characters and prints its 2nd
// and 3rd characters to standard output.
//
// Returns EXIT_SUCCESS on success and EXIT_FAILURE when no line can be read.
int main(int argc, char** argv) {
    // Use one locale name for the C library and for both wide streams instead
    // of a second, platform-specific name for the output stream only.
    // Note: the std::locale constructor throws std::runtime_error when the
    // named locale is not available.
    const char* const localeName = "ja_JP.UTF-8";
    setlocale(LC_ALL, localeName);
    const locale utf8Locale(localeName);
    wcin.imbue(utf8Locale);
    wcout.imbue(utf8Locale);

    wstring str;
    if (!getline(wcin, str)) {
        return EXIT_FAILURE;
    }

    // substr() throws std::out_of_range when the start position is past the
    // end of the string, so an empty line is mapped to an empty result.
    // The "2" is a length, not an index.
    str = str.empty() ? wstring() : str.substr(1, 2);
    wcout << str << endl;

    return EXIT_SUCCESS;
}

PHP

<?php

// Read everything available on standard input, then drop every CR and LF.
$input = file_get_contents('php://stdin');
$line = str_replace(array("\r", "\n"), '', $input);

// Take 2 characters starting at offset 1; the "2" is a length, not an index.
$part = mb_substr($line, 1, 2, 'UTF-8');
echo $part, PHP_EOL;

Python 2

import sys

s = sys.stdin.readline()
ustr = unicode(s, 'UTF-8')
ustr = ustr.replace('\n', '')
ustr = ustr.replace('\r', '')

print ustr[1:3].encode('UTF-8')

Python 3

import sys

# Read one line as raw bytes and decode it explicitly as UTF-8.
line = sys.stdin.buffer.readline().decode('UTF-8')

# Delete every LF and CR character in a single pass.
line = line.translate({ord('\n'): None, ord('\r'): None})

# Print characters [1, 3) of the cleaned line.
print(line[1:3])

Ruby

# Read standard input as UTF-8 explicitly instead of relying on the default
# external encoding, so that String#[] below counts characters, not bytes.
STDIN.set_encoding(Encoding::UTF_8)

str = STDIN.gets
# gets returns nil at end of input; there is nothing to extract in that case.
abort('no input') if str.nil?
str.chomp!

# Take 2 characters starting at offset 1; the "2" is a length, not an index.
print str[1, 2], "\n"

Perl

use Encode;

# Read one line of raw bytes from standard input and drop the trailing newline.
my $line = <STDIN>;
chomp $line;

# Decode the bytes into a character string so substr counts characters.
my $chars = Encode::decode('UTF-8', $line);

# Take 2 characters starting at offset 1; the "2" is a length, not an index.
my $part = substr $chars, 1, 2;
print Encode::encode('UTF-8', $part), "\n";

Go

package main

import (
    "fmt"
    "os"
    "io"
    "bufio"
)

func ReadLine(reader *bufio.Reader) (s string, err error) {
    prefix := false
    buf := make([]byte, 0)
    var line []byte
    for {
        line, prefix, err = reader.ReadLine()
        if err == io.EOF {
            return
        }
        buf = append(buf, line...)
        if prefix {
            continue
        }
        s = string(buf)
        return
    }
}

// main reads one line from standard input and prints the string made of its
// 2nd and 3rd characters (runes [1, 3)). It exits with status 1 when the line
// cannot be read or holds fewer than three characters.
func main() {
    stdin := bufio.NewReader(os.Stdin)
    s, err := ReadLine(stdin)
    if err != nil {
        // Report the read failure instead of silently slicing an empty string.
        fmt.Fprintln(os.Stderr, "failed to read input:", err)
        os.Exit(1)
    }

    // Convert to runes so that slicing counts code points, not UTF-8 bytes.
    runes := []rune(s)
    if len(runes) < 3 {
        // runes[1:3] would panic with a slice-bounds error on shorter input.
        fmt.Fprintln(os.Stderr, "input must contain at least 3 characters")
        os.Exit(1)
    }
    fmt.Println(string(runes[1:3]))
}

bash

#! /bin/bash

# Read one line verbatim: the empty IFS keeps leading/trailing whitespace and
# -r stops read from treating backslashes as escape characters.
IFS= read -r s

# Substring expansion: 2 characters starting at offset 1 (the "2" is a length,
# not an index). printf is used because echo would interpret input such as
# "-n" or "-e" as an option.
printf '%s\n' "${s:1:2}"

Awk

{
    # Work on a copy of the record with every CR and LF removed.
    line = $0;
    gsub(/[\r\n]/, "", line);
    # substr is 1-based: start at character 2 and take 2 characters
    # (the third argument is a length, not an index).
    print substr(line, 2, 2);
}