2. 正規表現
著者:梅谷 武
各言語における正規表現の扱いについて調べる。
作成:2024-05-28
更新:2024-11-02
更新:2024-11-02
正規表現の実装にはstd::regexを用いる。
正規表現の実装には正規表現モジュールModule Strを用いる。
正規表現についての概説はThe Racket Guide>9 Regular Expressions、詳細仕様はThe Racket Reference>4 Datatypes>4.8 Regular Expressionsにある。
HTMLファイル(input.html)から正規表現を使って全URLを抽出する。これをn回繰り返す。正規表現はPOSIX標準に準拠する。
rmatch.cpp:
#include <fstream>
#include <iostream>
#include <print>
#include <regex>
#include <string>
#include <vector>
int main(int argc, char *argv[]) {
if (argc != 2) {
std::println("Usage: rmatch <positive integer>");
return 1;
}
int n = std::stoll(argv[1]);
if (n < 1) {
std::println("{} is not positive.", n);
return 1;
}
std::ifstream file("input.html");
if (!file.is_open()) {
std::println(stderr, "\"input.html\" cannot be opened.");
return 1;
}
std::string line, text;
while (std::getline(file, line))
text += line + "\n";
file.close();
std::string pattern = "https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-]+";
std::regex re(pattern);
std::smatch matches;
std::vector match_results;
for (int j = 0; j < n; ++j) {
std::string::const_iterator search_start(text.cbegin());
while (std::regex_search(search_start, text.cend(), matches, re)) {
if (j == 0)
match_results.push_back(matches[0].str());
search_start = matches.suffix().first;
}
}
for (const auto &match : match_results)
std::println("{}", match);
std::println("loop count = {}", n);
return 0;
}
makefile:
CPPG = g++
CPPL = clang++
CPPFLAGS = -std=c++23 -O2
SRC = rmatch.cpp
EXES = rmatch_g rmatch_l
all: $(EXES)
rmatch_g: $(SRC)
$(CPPG) $(CPPFLAGS) -o $@ $(SRC)
rmatch_l: $(SRC)
$(CPPL) $(CPPFLAGS) -o $@ $(SRC)
clean:
rm -f $(EXES)
n = 1000で測定する。
$ measure ./rmatch_g 1000
http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/String.html
https://ocaml.org/manual/5.2/api/Str.html
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/Filesystem.html
loop count = 1000
======================================
Process exited with status: 0
total time: 0.392914 [sec]
mem size: 3920 [KB]
code size: 268 [KB]
$ measure ./rmatch_l 1000
http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/String.html
https://ocaml.org/manual/5.2/api/Str.html
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/Filesystem.html
loop count = 1000
======================================
Process exited with status: 0
total time: 0.368251 [sec]
mem size: 4016 [KB]
code size: 255 [KB]
rmatch.ml:
open Printf
let read_file filename =
try
let ic = open_in filename in
let rec read_lines acc =
match input_line ic with
| line ->
read_lines (line :: acc)
| exception End_of_file ->
close_in ic;
List.rev acc
in
read_lines []
with
| Sys_error msg ->
print_endline ("Error: " ^ msg);
exit 1
let get_urls text =
let pattern = Str.regexp "https?://[0-9a-z_/:%#\\$&\\?\\(\\)~\\.=\\+\\-]+" in
let rec get_urls_aux pos acc =
try
let start_pos = Str.search_forward pattern text pos in
let match_str = Str.matched_string text in
get_urls_aux (start_pos + 1) (match_str :: acc)
with Not_found -> List.rev acc
in
get_urls_aux 0 []
let () =
let len = Array.length Sys.argv in
if len <> 2 then
printf "Usage: rmatch <positive integer>\n"
else
let n = int_of_string (Sys.argv.(1)) in
if n < 1 then
printf "%d is not positive.\n" n
else (* ← shift 4 *)
let lines = read_file "input.html" in
let text = String.concat "\n" lines in
for j = 0 to n - 1 do
let matches = get_urls text in
if (j = 0) then
List.iter (printf "%s\n") matches
else
()
done;
printf "loop count = %d\n" n
makefile:
OCAMLOPT = ocamlopt
OCAMLFLAGS = -O2
LIBPATH = -I +str
LIBS = str.cmxa
SRC = rmatch.ml
EXES = rmatch
all: $(EXES)
rmatch: $(SRC)
$(OCAMLOPT) $(OCAMLFLAGS) $(LIBPATH) -o $@ $(LIBS) $(SRC)
clean:
rm -f $(EXES) *.o *.cmx *.cmi
n = 1000で測定する。
$ measure ./rmatch 1000
http://www.w3.org/
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/
https://ocaml.org/manual/5.2/api/
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/
loop count = 1000
======================================
Process exited with status: 0
total time: 0.027481 [sec]
mem size: 5644 [KB]
code size: 2616 [KB]
POSIX準拠ではなく使える構文が少ないので正規表現パターンをC++、Racketと変えている。その代わりにC++やRacketと比べると一桁速い。
rmatch.rkt:
#lang racket
(require iso-printf)
(require racket/file)
(define (rmatch-body n)
(define input-file "input.html")
(if (not (file-exists? input-file))
(begin
(printf "\"input.html\" cannot be opened.\n")
(exit 1))
void)
(define text (file->string input-file))
(define pattern #px"https?://[\\w/:%#\\$&\\?\\(\\)~\\.=\\+\\-]+")
(for ([j (in-range n)])
(define matches (regexp-match* pattern text))
(when (= j 0)
(for ([match matches])
(printf "%s\n" match))))
(printf "loop count = %d\n" n))
(define (main args)
(let ([len (vector-length args)]
[n 0])
(cond
[(= len 1)
(set! n (string->number (vector-ref args 0)))
(cond
[(> n 0)
(rmatch-body n)]
[else
(printf "%d is not positive.\n" n)])]
[else
(printf "Usage: rmatch <positive integer>\n")])))
(main (current-command-line-arguments))
n = 1000で測定する。
$ raco exe rmatch.rkt
$ measure ./rmatch 1000
http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd
http://www.w3.org/1999/xhtml
https://cpprefjp.github.io/reference/string/basic_string.html
https://ocaml.org/manual/5.1/api/String.html
https://ocaml.org/manual/5.2/api/Str.html
https://docs.racket-lang.org/guide/characters.html
https://docs.racket-lang.org/reference/characters.html
https://docs.racket-lang.org/guide/strings.html
https://docs.racket-lang.org/reference/strings.html
https://docs.racket-lang.org/srfi/srfi-std/srfi-13.html#string-contains
https://docs.racket-lang.org/reference/Filesystem.html
loop count = 1000
======================================
Process exited with status: 0
total time: 1.138056 [sec]
mem size: 134004 [KB]
code size: 12443 [KB]
日本語テキストファイル(kanji.txt)から正規表現を使って全数値表記を抽出する。これをn回繰り返す。正規表現はPOSIX標準に準拠する。
rmatch_u.cpp:
#include <fstream>
#include <iostream>
#include <print>
#include <string>
#include <unicode/regex.h>
#include <unicode/unistr.h>
#include <vector>
int main(int argc, char *argv[]) {
if (argc != 2) {
std::println("Usage: rmatch_u <positive integer>");
return 1;
}
int n = std::stoll(argv[1]);
if (n < 1) {
std::println("{} is not positive.", n);
return 1;
}
std::ifstream file("kanji.txt");
if (!file.is_open()) {
std::println(stderr, "\"kanji.txt\" cannot be opened.");
return 1;
}
std::string line, u8text;
while (std::getline(file, line))
u8text += line + "\n";
file.close();
icu::UnicodeString text = icu::UnicodeString::fromUTF8(u8text);
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString rstr = icu::UnicodeString::fromUTF8(
"[0-90-9〇一二三四五六七八九十百千万億兆]+");
icu::RegexPattern *pattern = icu::RegexPattern::compile(rstr, 0, status);
if (U_FAILURE(status)) {
std::println(stderr, "Error pattern: {}", u_errorName(status));
return 1;
}
icu::RegexMatcher *matcher = pattern->matcher(text, status);
if (U_FAILURE(status)) {
std::println(stderr, "Error matcher: {}", u_errorName(status));
delete pattern;
return 1;
}
std::vector matches;
for (int j = 0; j < n; ++j) {
matches.clear();
while (matcher->find(status) && U_SUCCESS(status))
matches.push_back(matcher->group(status));
if (U_FAILURE(status))
std::println(stderr, "Error find: {}", u_errorName(status));
if (j == 0)
std::println("match count = {}", matches.size());
}
std::println("loop count = {}", n);
delete matcher;
delete pattern;
return 0;
}
makefile:
CPPG = g++
CPPL = clang++
CPPFLAGS = -std=c++23 -O2
LIBS = -licuuc -licui18n
SRC = rmatch_u.cpp
EXES = rmatch_ug rmatch_ul
all: $(EXES)
rmatch_ug: $(SRC)
$(CPPG) $(CPPFLAGS) -o $@ $(SRC) $(LIBS)
rmatch_ul: $(SRC)
$(CPPL) $(CPPFLAGS) -o $@ $(SRC) $(LIBS)
clean:
rm -f $(EXES)
n = 1000000で測定する。
$ measure ./rmatch_ug 1000000
match count = 175
loop count = 1000000
======================================
Process exited with status: 0
total time: 0.016071 [sec]
mem size: 7192 [KB]
code size: 132 [KB]
$ measure ./rmatch_ul 1000000
match count = 175
loop count = 1000000
======================================
Process exited with status: 0
total time: 0.016368 [sec]
mem size: 7172 [KB]
code size: 102 [KB]
rmatch_u.rkt:
#lang racket
(require iso-printf)
(require racket/file)
(define (rmatch-body n)
(define input-file "kanji.txt")
(if (not (file-exists? input-file))
(begin
(printf "\"kanji.txt\" cannot be opened.\n")
(exit 1))
void)
(define text (file->string input-file))
(define pattern #px"[0-90-9〇一二三四五六七八九十百千万億兆]+")
(for ([j (in-range n)])
(define matches (regexp-match* pattern text))
(when (= j 0)
(printf "match count = %d\n" (length matches))))
(printf "loop count = %d\n" n))
(define (main args)
(let ([len (vector-length args)]
[n 0])
(cond
[(= len 1)
(set! n (string->number (vector-ref args 0)))
(cond
[(> n 0)
(rmatch-body n)]
[else
(printf "%d is not positive.\n" n)])]
[else
(printf "Usage: rmatch <positive integer>\n")])))
(main (current-command-line-arguments))
n = 1000で測定する。
$ measure ./rmatch_u 1000
match count = 175
loop count = 1000
======================================
Process exited with status: 0
total time: 3.963383 [sec]
mem size: 134404 [KB]
code size: 12442 [KB]
C++よりかなり遅い。
測定結果を表にまとめる。単位は秒[sec]。
測定項目 | GCC | Clang | OCaml | Racket |
---|---|---|---|---|
正規表現(バイト) | 0.393 | 0.368 | 0.027 | 1.138 |
正規表現(UNICODE) | 0.016 | 0.016 | - | - |
バイト文字列の正規表現はOCamlが速いがPOSIX準拠ではなく使える構文は少ない。UNICODE文字列の正規表現はC++が速い。Racketは特にUNICODEで正規表現が遅い。
© 2024 Takeshi Umetani