1. 文字列照合

#include <array>
#include <fstream>
#include <iostream>
#include <print>
#include <string>
 
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::println("Usage: pmatch <positive integer>");
    return 1;
  }
  int n = std::stoll(argv[1]);
  if (n < 1) {
    std::println("{} is not positive.", n);
    return 1;
  }
  std::ifstream file("input.txt");
  if (!file.is_open()) {
    std::println(stderr, "\"input.txt\" cannot be opened.");
    return 1;
  }
  std::string line, text;
  while (std::getline(file, line))
    text += line + "\n";
  file.close();
  std::array<std::string, 4> words = {"Lincoln",
                                      "Aristotle",
                                      "Laplace",
                                      "Poisson"};
  size_t wordCount = words.size();
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < wordCount; ++i) {
      size_t found = text.find(words[i]);
      if (j == 0) {
        if (found != std::string::npos)
          std::println("{} found at position: {}", words[i], found);
        else
          std::println("{} not found.", words[i]);
      }
    }
  std::println("loop count = {}", n);
  return 0;
}

makefile:

CPPG = g++
CPPL = clang++
CPPFLAGS = -std=c++23 -O2
SRC = pmatch.cpp
EXES = pmatch_g pmatch_l
 
all: $(EXES)
 
pmatch_g: $(SRC)
	$(CPPG) $(CPPFLAGS) -o $@ $(SRC)
 
pmatch_l: $(SRC)
	$(CPPL) $(CPPFLAGS) -o $@ $(SRC)
 
clean:
	rm -f $(EXES)

n = 100000で測定する。

$ measure ./pmatch_g 100000
Lincoln found at position: 4225
Aristotle found at position: 7262
Laplace found at position: 8406
Poisson found at position: 50550
loop count = 100000
======================================
Process exited with status: 0
total time:  0.195227 [sec]
mem  size:       4120 [KB]
code size:        127 [KB]

$ measure ./pmatch_l 100000
Lincoln found at position: 4225
Aristotle found at position: 7262
Laplace found at position: 8406
Poisson found at position: 50550
loop count = 100000
======================================
Process exited with status: 0
total time:  0.200083 [sec]
mem  size:       4104 [KB]
code size:        102 [KB]

OCaml

pmatch.ml:

open Printf
 
let read_file filename =
  try
    let ic = open_in filename in
    let rec read_lines acc =
      match input_line ic with
      | line ->
        read_lines (line :: acc)
      | exception End_of_file ->
        close_in ic;
        List.rev acc
    in
    read_lines []
  with
    | Sys_error msg ->
      print_endline ("Error: " ^ msg);
      exit 1
 
let () =
  let len = Array.length Sys.argv in
  if len <> 2 then
    printf "Usage: pmatch <positive integer>\n"
  else
    let n = int_of_string (Sys.argv.(1)) in
    if n < 1 then
      printf "%d is not positive.\n" n
    else (* ← shift 4 *)
  let lines = read_file "input.txt" in
  let text = String.concat "\n" lines in
  let patterns = ["Lincoln"; "Aristotle"; "Laplace"; "Poisson"] in
  for j = 0 to n - 1 do
    List.iter (fun word ->
      match Str.search_forward (Str.regexp_string word) text 0 with
      | exception Not_found ->
        if j = 0 then
          print_endline (word ^ " not found.")
        else
          ()
      | index ->
        if j = 0 then
          print_endline (word ^ " found at position: " ^ string_of_int index)
        else
          ())
      patterns
  done;
  printf "loop count = %d\n" n

makefile:

OCAMLOPT = ocamlopt
OCAMLFLAGS = -O2
LIBPATH = -I +str
LIBS = str.cmxa
SRC = pmatch.ml
EXES = pmatch
 
all: $(EXES)
 
pmatch: $(SRC)
	$(OCAMLOPT) $(OCAMLFLAGS) $(LIBPATH) -o $@ $(LIBS) $(SRC)
 
clean:
	rm -f $(EXES) *.o *.cmx *.cmi

n = 100000で測定する。

$ measure ./pmatch 100000
Lincoln found at position: 4225
Aristotle found at position: 7262
Laplace found at position: 8406
Poisson found at position: 50550
loop count = 100000
======================================
Process exited with status: 0
total time:  6.636662 [sec]
mem  size:       5700 [KB]
code size:       2616 [KB]

正規表現モジュールStrを使っていることで時間がかかっているかもしれない。単純なパターンマッチングであればもう少し速くなる可能性がある。

Racket

Racketにはバイト文字列の文字列照合機能がないので、バイト文字列を標準文字列(UNICODE)として扱い、srfi/13の文字列照合機能を用いる。

pmatch.rkt:

#lang racket
(require iso-printf)
(require srfi/13)
(require racket/file)
 
(define (pmatch-body n)
  (define input-file "input.txt")
  (if (not (file-exists? input-file))
      (begin
        (printf "\"input.txt\" cannot be opened.\n")
        (exit 1))
      void)
  (define text (file->string input-file)) 
  (define patterns '("Lincoln" "Aristotle" "Laplace" "Poisson"))
  (for ([j (in-range n)])
    (for ([pattern patterns])
      (define pos (string-contains text pattern))
      (when (= j 0)
        (if (integer? pos)
            (printf "%s found at position: %d\n" pattern pos)
            (printf "%s is not found." pattern)))))
  (printf "loop count = %d\n" n))
 
(define (main args)
  (let ([len (vector-length args)]
        [n 0])
    (cond
      [(= len 1)
        (set! n (string->number (vector-ref args 0)))
        (cond
          [(> n 0)
            (pmatch-body n)]
          [else
            (printf "%d is not positive.\n" n)])]
      [else
        (printf "Usage: pmatch <positive integer>\n")])))
 
(main (current-command-line-arguments))

n = 10000で測定する。

$ raco exe pmatch.rkt
$ measure ./pmatch 10000
Lincoln found at position: 4198
Aristotle found at position: 7219
Laplace found at position: 8361
Poisson found at position: 50388
loop count = 10000
======================================
Process exited with status: 0
total time:  9.294210 [sec]
mem  size:     134312 [KB]
code size:      12741 [KB]

UNICODEでパターンマッチングしているRacketはかなり時間がかかる。C++、OCamlとRacketで位置が異なるのはinput.txtは英文テキストであるが、一部にUTF-8符号化されたUNICODEが含まれているからである。C++、OCamlはバイト列として認識し、RacketはUTF-8符号化文字列として認識している。厳密に言えばC++、OCamlは間違っており、Racketは正しい文字列処理をしている。

1.3 UNICODE文字列

課題

9,311バイトのUTF-8テキスト(unicode.txt)を読み込み、次の四つの語：

カルダーノ
デル・フェッロ
フォンタナ
ブール

について文字列照合を行ない、存在した場合にその位置を表示する。これをn回繰り返す。

C++

pmatch_u.cpp:

#include <fstream>
#include <iostream>
#include <print>
#include <string>
#include <unicode/unistr.h>
 
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::println("Usage: pmatch_u <positive integer>");
    return 1;
  }
  int n = std::stoll(argv[1]);
  if (n < 1) {
    std::println("{} is not positive.", n);
    return 1;
  }
  std::ifstream file("unicode.txt");
  if (!file.is_open()) {
    std::println(stderr, "\"unicode.txt\" cannot be opened.");
    return 1;
  }
  std::string line, u8text;
  while (std::getline(file, line))
    u8text += line + "\n";
  file.close();
  icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(u8text);
  std::array<icu::UnicodeString, 4> words = {
      icu::UnicodeString::fromUTF8("カルダーノ"),
      icu::UnicodeString::fromUTF8("デル・フェッロ"),
      icu::UnicodeString::fromUTF8("フォンタナ"),
      icu::UnicodeString::fromUTF8("ブール")};
  size_t wordCount = words.size();
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < wordCount; ++i) {
      int32_t pos = ustr.indexOf(words[i]);
      if (j == 0) {
        std::string u8Word;
        words[i].toUTF8String(u8Word);
        if (pos != -1)
          std::println("{} found at position: {}", u8Word, pos);
        else
          std::println("{} not found.", u8Word);
      }
    }
  std::println("loop count = {}", n);
  return 0;
}

makefile:

CPPG = g++
CPPL = clang++
CPPFLAGS = -std=c++23 -O2
LIBS = -licuuc
SRC = pmatch_u.cpp
EXES = pmatch_ug pmatch_ul
 
all: $(EXES)
 
pmatch_ug: $(SRC)
	$(CPPG) $(CPPFLAGS) -o $@ $(SRC) $(LIBS)
 
pmatch_ul: $(SRC)
	$(CPPL) $(CPPFLAGS) -o $@ $(SRC) $(LIBS)
 
clean:
	rm -f $(EXES)

n = 100000で測定する。

$ measure ./pmatch_ug 100000
カルダーノ found at position: 10
デル・フェッロ found at position: 221
フォンタナ found at position: 110
ブール found at position: 2125
loop count = 100000
======================================
Process exited with status: 0
total time:  0.164566 [sec]
mem  size:       4876 [KB]
code size:        133 [KB]

$ measure ./pmatch_ul 100000
カルダーノ found at position: 10
デル・フェッロ found at position: 221
フォンタナ found at position: 110
ブール found at position: 2125
loop count = 100000
======================================
Process exited with status: 0
total time:  0.147316 [sec]
mem  size:       4856 [KB]
code size:        103 [KB]

Racket

pmatch.rktでデータを変えただけである。

pmatch_u.rkt:

#lang racket
(require iso-printf)
(require srfi/13)
(require racket/file)
 
(define (pmatch-body n)
  (define input-file "unicode.txt")
  (if (not (file-exists? input-file))
      (begin
        (printf "\"unicode.txt\" cannot be opened.\n")
        (exit 1))
      void)
  (define text (file->string input-file)) 
  (define patterns '("カルダーノ" "デル・フェッロ" "フォンタナ" "ブール"))
  (for ([j (in-range n)])
    (for ([pattern patterns])
      (define pos (string-contains text pattern))
      (when (= j 0)
        (if (integer? pos)
            (printf "%s found at position: %d\n" pattern pos)
            (printf "%s is not found." pattern)))))
  (printf "loop count = %d\n" n))
 
(define (main args)
  (let ([len (vector-length args)]
        [n 0])
    (cond
      [(= len 1)
        (set! n (string->number (vector-ref args 0)))
        (cond
          [(> n 0)
            (pmatch-body n)]
          [else
            (printf "%d is not positive.\n" n)])]
      [else
        (printf "Usage: pmatch_u <positive integer>\n")])))
 
(main (current-command-line-arguments))

n = 100000で測定する。

$ raco exe pmatch_u.rkt
$ measure ./pmatch_u 100000
カルダーノ found at position: 10
デル・フェッロ found at position: 221
フォンタナ found at position: 110
ブール found at position: 2125
loop count = 100000
======================================
Process exited with status: 0
total time:  3.727788 [sec]
mem  size:     139936 [KB]
code size:      12741 [KB]

1.4 まとめ

測定結果を表にまとめる。単位は秒[sec]。

測定項目	GCC	Clang	OCaml	Racket
バイト文字列	0.195	0.200	6.637	-
UNICODE文字列	0.165	0.147	-	3.546

文字列照合に関してはバイト文字列でもUNICODE文字列でもC++が処理性能において優位に立っている。OCamlには標準でUNICODE文字列処理機能は無い。Racketには文字符号化法に関知しないで文字列処理を書けるメリットがあり、プロトタイピングに向いている。