2. 正規表現

著者:梅谷武

各言語における正規表現の扱いについて調べる。

作成:2024-05-28
更新:2024-11-02

2.1 言語仕様

C++

正規表現の実装にはstd::regexを用いる。

OCaml

正規表現の実装には正規表現モジュールModule Strを用いる。

Racket

正規表現についての概説はThe Racket Guide＞9 Regular Expressions、詳細仕様はThe Racket Reference＞4 Datatypes＞4.8 Regular Expressionsにある。

2.2 バイト文字列

課題

HTMLファイル(input.html)から正規表現を使って全URLを抽出する。これをn回繰り返す。正規表現はPOSIX標準に準拠する。

C++

rmatch.cpp:

#include <fstream>
#include <iostream>
#include <print>
#include <regex>
#include <string>
#include <vector>
 
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::println("Usage: rmatch <positive integer>");
    return 1;
  }
  int n = std::stoll(argv[1]);
  if (n < 1) {
    std::println("{} is not positive.", n);
    return 1;
  }
  std::ifstream file("input.html");
  if (!file.is_open()) {
    std::println(stderr, "\"input.html\" cannot be opened.");
    return 1;
  }
  std::string line, text;
  while (std::getline(file, line))
    text += line + "\n";
  file.close();
  std::string pattern = "https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-]+";
  std::regex re(pattern);
  std::smatch matches;
  std::vector match_results;
  for (int j = 0; j < n; ++j) {
    std::string::const_iterator search_start(text.cbegin());
    while (std::regex_search(search_start, text.cend(), matches, re)) {
      if (j == 0)
        match_results.push_back(matches[0].str());
      search_start = matches.suffix().first;
    }
  }
  for (const auto &match : match_results)
    std::println("{}", match);
  std::println("loop count = {}", n);
  return 0;
}

makefile:

CPPG = g++
CPPL = clang++
CPPFLAGS = -std=c++23 -O2
SRC = rmatch.cpp
EXES = rmatch_g rmatch_l
 
all: $(EXES)
 
rmatch_g: $(SRC)
	$(CPPG) $(CPPFLAGS) -o $@ $(SRC)
 
rmatch_l: $(SRC)
	$(CPPL) $(CPPFLAGS) -o $@ $(SRC)
 
clean:
	rm -f $(EXES)

n = 1000で測定する。

$ measure ./rmatch_g 1000
http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/String.html
https://ocaml.org/manual/5.2/api/Str.html
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/Filesystem.html
loop count = 1000
======================================
Process exited with status: 0
total time:  0.392914 [sec]
mem  size:       3920 [KB]
code size:        268 [KB]

$ measure ./rmatch_l 1000
http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/String.html
https://ocaml.org/manual/5.2/api/Str.html
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/Filesystem.html
loop count = 1000
======================================
Process exited with status: 0
total time:  0.368251 [sec]
mem  size:       4016 [KB]
code size:        255 [KB]

OCaml

rmatch.ml:

open Printf
 
let read_file filename =
  try
    let ic = open_in filename in
    let rec read_lines acc =
      match input_line ic with
      | line ->
        read_lines (line :: acc)
      | exception End_of_file ->
        close_in ic;
        List.rev acc
    in
    read_lines []
  with
    | Sys_error msg ->
      print_endline ("Error: " ^ msg);
      exit 1
 
let get_urls text =
  let pattern = Str.regexp "https?://[0-9a-z_/:%#\\$&\\?\\(\\)~\\.=\\+\\-]+" in
  let rec get_urls_aux pos acc =
    try
      let start_pos = Str.search_forward pattern text pos in
      let match_str = Str.matched_string text in
      get_urls_aux (start_pos + 1) (match_str :: acc)
    with Not_found -> List.rev acc
  in
  get_urls_aux 0 []
 
let () =
  let len = Array.length Sys.argv in
  if len <> 2 then
    printf "Usage: rmatch <positive integer>\n"
  else
    let n = int_of_string (Sys.argv.(1)) in
    if n < 1 then
      printf "%d is not positive.\n" n
    else (* ← shift 4 *)
  let lines = read_file "input.html" in
  let text = String.concat "\n" lines in
  for j = 0 to n - 1 do
    let matches = get_urls text in
    if (j = 0) then
      List.iter (printf "%s\n") matches
    else
      ()
  done;
  printf "loop count = %d\n" n

makefile:

OCAMLOPT = ocamlopt
OCAMLFLAGS = -O2
LIBPATH = -I +str
LIBS = str.cmxa
SRC = rmatch.ml
EXES = rmatch
 
all: $(EXES)
 
rmatch: $(SRC)
	$(OCAMLOPT) $(OCAMLFLAGS) $(LIBPATH) -o $@ $(LIBS) $(SRC)
 
clean:
	rm -f $(EXES) *.o *.cmx *.cmi

n = 1000で測定する。

$ measure ./rmatch 1000
http://www.w3.org/
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/
https://ocaml.org/manual/5.2/api/
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/
loop count = 1000
======================================
Process exited with status: 0
total time:  0.027481 [sec]
mem  size:       5644 [KB]
code size:       2616 [KB]

POSIX準拠ではなく使える構文が少ないので正規表現パターンをC++、Racketと変えている。その代わりにC++やRacketと比べると一桁速い。

Racket

rmatch.rkt:

#lang racket
(require iso-printf)
(require racket/file)
 
(define (rmatch-body n)
  (define input-file "input.html")
  (if (not (file-exists? input-file))
      (begin
        (printf "\"input.html\" cannot be opened.\n")
        (exit 1))
      void)
  (define text (file->string input-file)) 
  (define pattern #px"https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-]+")
  (for ([j (in-range n)])
    (define matches (regexp-match* pattern text))
    (when (= j 0)
      (for ([match matches])
        (printf "%s\n" match))))
  (printf "loop count = %d\n" n))
 
(define (main args)
  (let ([len (vector-length args)]
        [n 0])
    (cond
      [(= len 1)
        (set! n (string->number (vector-ref args 0)))
        (cond
          [(> n 0)
            (rmatch-body n)]
          [else
            (printf "%d is not positive.\n" n)])]
      [else
        (printf "Usage: rmatch <positive integer>\n")])))
 
(main (current-command-line-arguments))

n = 1000で測定する。

$ raco exe rmatch.rkt
$ measure ./rmatch 1000
http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/String.html
https://ocaml.org/manual/5.2/api/Str.html
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/Filesystem.html
loop count = 1000
======================================
Process exited with status: 0
total time:  1.138056 [sec]
mem  size:     134004 [KB]
code size:      12443 [KB]

2.3 UNICODE文字列

課題

日本語テキストファイル(kanji.txt)から正規表現を使って全数値表記を抽出する。これをn回繰り返す。正規表現はPOSIX標準に準拠する。

C++

rmatch_u.cpp:

#include <fstream>
#include <iostream>
#include <print>
#include <string>
#include <unicode/regex.h>
#include <unicode/unistr.h>
#include <vector>
 
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::println("Usage: rmatch_u <positive integer>");
    return 1;
  }
  int n = std::stoll(argv[1]);
  if (n < 1) {
    std::println("{} is not positive.", n);
    return 1;
  }
  std::ifstream file("kanji.txt");
  if (!file.is_open()) {
    std::println(stderr, "\"kanji.txt\" cannot be opened.");
    return 1;
  }
  std::string line, u8text;
  while (std::getline(file, line))
    u8text += line + "\n";
  file.close();
  icu::UnicodeString text = icu::UnicodeString::fromUTF8(u8text);
  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString rstr = icu::UnicodeString::fromUTF8(
      "[0-9０-９〇一二三四五六七八九十百千万億兆]+");
  icu::RegexPattern *pattern = icu::RegexPattern::compile(rstr, 0, status);
  if (U_FAILURE(status)) {
    std::println(stderr, "Error pattern: {}", u_errorName(status));
    return 1;
  }
  icu::RegexMatcher *matcher = pattern->matcher(text, status);
  if (U_FAILURE(status)) {
    std::println(stderr, "Error matcher: {}", u_errorName(status));
    delete pattern;
    return 1;
  }
  std::vector matches;
  for (int j = 0; j < n; ++j) {
    matches.clear();
    while (matcher->find(status) && U_SUCCESS(status))
      matches.push_back(matcher->group(status));
    if (U_FAILURE(status))
      std::println(stderr, "Error find: {}", u_errorName(status));
    if (j == 0)
      std::println("match count = {}", matches.size());
  }
  std::println("loop  count = {}", n);
  delete matcher;
  delete pattern;
  return 0;
}

makefile:

CPPG = g++
CPPL = clang++
CPPFLAGS = -std=c++23 -O2
LIBS = -licuuc -licui18n
SRC = rmatch_u.cpp
EXES = rmatch_ug rmatch_ul
 
all: $(EXES)
 
rmatch_ug: $(SRC)
	$(CPPG) $(CPPFLAGS) -o $@ $(SRC) $(LIBS)
 
rmatch_ul: $(SRC)
	$(CPPL) $(CPPFLAGS) -o $@ $(SRC) $(LIBS)
 
clean:
	rm -f $(EXES)

n = 1000000で測定する。

$ measure ./rmatch_ug 1000000
match count = 175
loop  count = 1000000
======================================
Process exited with status: 0
total time:  0.016071 [sec]
mem  size:       7192 [KB]
code size:        132 [KB]

$ measure ./rmatch_ul 1000000
match count = 175
loop  count = 1000000
======================================
Process exited with status: 0
total time:  0.016368 [sec]
mem  size:       7172 [KB]
code size:        102 [KB]

Racket

rmatch_u.rkt:

#lang racket
(require iso-printf)
(require racket/file)
 
(define (rmatch-body n)
  (define input-file "kanji.txt")
  (if (not (file-exists? input-file))
    (begin
      (printf "\"kanji.txt\" cannot be opened.\n")
      (exit 1))
    void)
  (define text (file->string input-file)) 
  (define pattern #px"[0-9０-９〇一二三四五六七八九十百千万億兆]+")
  (for ([j (in-range n)])
    (define matches (regexp-match* pattern text))
    (when (= j 0)
      (printf "match count = %d\n" (length matches))))
  (printf "loop count = %d\n" n))
 
(define (main args)
  (let ([len (vector-length args)]
        [n 0])
    (cond
      [(= len 1)
        (set! n (string->number (vector-ref args 0)))
        (cond
          [(> n 0)
            (rmatch-body n)]
          [else
            (printf "%d is not positive.\n" n)])]
      [else
        (printf "Usage: rmatch <positive integer>\n")])))
 
(main (current-command-line-arguments))

n = 1000で測定する。

$ measure ./rmatch_u 1000
match count = 175
loop count = 1000
======================================
Process exited with status: 0
total time:  3.963383 [sec]
mem  size:     134404 [KB]
code size:      12442 [KB]

C++よりかなり遅い。

2.4 まとめ

測定結果を表にまとめる。単位は秒[sec]。

測定項目	GCC	Clang	OCaml	Racket
正規表現(バイト)	0.393	0.368	0.027	1.138
正規表現(UNICODE)	0.016	0.016	-	-

バイト文字列の正規表現はOCamlが速いがPOSIX準拠ではなく使える構文は少ない。UNICODE文字列の正規表現はC++が速い。Racketは特にUNICODEで正規表現が遅い。