Perl ޸

Codeordie


ž

ó

  • Perl5 () ϰ ֽϴ. ,perl ϰ ֱ , jperl δٰ ϴ ϴ.
  • perl ũƮ EUC-JP ׸ ϰ ֽϴ.
  • CGI ϸƮ 䡤FAQ , ޸ Դϴ. ٸ, CGI Ưȭ ƴմϴ.
  • ַ ( ũ) / Perl / WWW ũ ֽϴ.
  • ִ ũƮ, åӿ ּ. , ϰ ֱ , ̿ Ǵ () ֽʽÿ.
  • ִ ũƮ ̿롤 Դϴ. 𿣰 URI( http://www.din.or.jp/~ohzaki/perl.htm )() μ ֽø Դϴ().
  • Internet Explorer 5 Netscape Communicator 4.75 ׸ ǥ Ȯ ǽϰ ֽϴ. ̰͵ ̿ Ͻô , ǥõ 𸨴ϴ.
  • ǰߡ ohzaki@din.or.jp Źմϴ. ̷ , ʾ, , ׺ġ, ̰ͼ, ٸ ֽϴ.
  • ũ ּ ϴ. URI http://www.din.or.jp/~ohzaki/perl.htm Դϴ.
  • ο Ǵ ϴ , μ URI( http://www.din.or.jp/~ohzaki/perl.htm )() ּ. URI ϴ 쿡 㰡 ʿ ϴٸ, Ŀ ֽø ޴ϴ. URI ʴ 쿡 㰡 ο Ǵ ϴ մϴ.
ž

Ÿ ( ) Ѵ

# Acquire an exclusive lock by renaming the lock file
# "<dir><basename>" to "<dir><basename><epoch time>".
#
# Options (name => value pairs, all optional):
#   dir      - directory holding the lock file, with trailing slash
#              (default './lockdir/')
#   basename - name of the lock file while unlocked (default 'lockfile')
#   timeout  - age in seconds after which a held lock counts as stale
#              (default 60)
#   trytime  - number of attempts, one second apart (default 10)
#
# Returns a hash reference to hand to my_funlock() on success, or undef
# when the lock could not be obtained.
sub my_flock {
  my %lfh = (dir => './lockdir/', basename => 'lockfile',
             timeout => 60, trytime => 10, @_);
  $lfh{path} = $lfh{dir} . $lfh{basename};

  # Normal case: whoever manages to rename the unlocked file owns the lock.
  for (my $i = 0; $i < $lfh{trytime}; $i++, sleep 1) {
    return \%lfh if (rename($lfh{path}, $lfh{current} = $lfh{path} . time));
  }

  # Still locked: find the timestamped file and take it over if it is stale.
  opendir(my $lockdir, $lfh{dir}) or return undef;
  my @filelist = readdir($lockdir);
  closedir($lockdir);
  foreach (@filelist) {
    # \Q...\E: the base name is a file name, not a pattern.
    if (/^\Q$lfh{basename}\E(\d+)/) {
      # Taking over is itself a rename, so when several processes see the
      # same stale lock only one of them can succeed.
      return \%lfh if (time - $1 > $lfh{timeout} and
          rename($lfh{dir} . $_, $lfh{current} = $lfh{path} . time));
      last;
    }
  }
  return undef;
}

# Release a lock obtained from my_flock(): rename the timestamped file
# back to its unlocked name.  Returns the result of rename().
sub my_funlock {
  my ($lock) = @_;
  return rename($lock->{current}, $lock->{path});
}

# ٴ(Ÿ ƿ ־)
# Acquire the lock; my_flock() returns undef when it cannot, so give up.
$lfh = my_flock() or die 'Busy! ';

# Release the lock by handing the lock hash back to my_funlock().
my_funlock($lfh);

μ ÿ ִ 1 а ɼ () ִ ,Ÿ ȵ˴ϴ. Ÿ  ϴ ΰ ֽϴٸ, ũƮ ħ ٰϰ ֽϴ.

  1.  ÷̶ ִ
  2. ̻ ¸ ȸ ִ

Ÿ  ϴ μ flock Լ symlink Լ ϴ ֽϴٸ, ̷ Լ ÷ 󼭴 Ʈǰ ʽϴ. , 1 ä () ̷ ϴ. ̿ μ, mkdir Լ ϴ rename Լ ˴ϴ. 2 ()Դϴٸ, ̻ ¶, μ ä  ׾ 쿡, ʰ Դϴ. flock ϰ ִ , · μ ׾ ڵ DZ , ̻ ´ ڽ . ׷, symlink mkdir, rename ϴ 쿡 ũƮ ó ʿϰ ˴ϴ.

ü  óұԴϴٸ, ° ִ ð ־ 쿡 ̶̻ Ǵ, ٸ μ ¸ ص Ϳ . ⿡ մϴ. Ÿ  ϴ μ symlink mkdir, rename ϴ? װ ̷ Լ,  ׽Ʈ ״ ÿ ִ atomic Լ̱ Դϴ. ̾߱⸦ ǵ, ̻ ¸ մϴ. , mkdir , ̻ ϴµ, ũƮ ˴ϴ.

# Remove $lockdir when its mtime (stat field 9) is more than 60 seconds old.
# The age test and the rmdir are two separate calls, not one atomic step.
rmdir($lockdir) if (time - (stat($lockdir))[9] > 60);

° 60̻ ϰ ־ 쿡 ϸ() ϴ ũƮԴϴٸ, ̰ symlink () mkdir, rename ޸,  Ǵܰ ϴ ÿ ϰ ִ ƴϴٰ ϴ ˴ϴ. ü ϸ(), µ ִ ϴ Դϴ. װ Դϴ.

μ Aμ Bμ C
̻ Ǵ̻ Ǵ

μ ° ̻ϴٶ Ǵ, 1 Ϳ , ٸ μ ׾ΰ ʰ, ° ̻ϴٶ Ǵ μ Ǿ ɼ ֽϴ.

, ̻ ¸ ϴ µ ϰ Ǵ Ϳ ֽϴ. ݴ ϸ, ̻ ¸ ϴ ۿ ¸ Դϴ.  ? ° ׻ ȭ ׷ٰ ϴ Դϴ. ׸, ̰ ϴµ rename ˴ϴ.

ũƮ ϸ, lockfile Ѵ ̸ ǰ ִ ·,lockfile987654321 ڷ ۼ ð ° ° ˴ϴ. ̷ ϴ ,μ B μ C Ǿ ȴٰ ϴ Ȳ ȸ ֽϴ. ֳϸ,μ C rename ũ ̸ μ B ˰ ִ ̸ ̰ Դϴ. ũƮ ϴ ϴ ƴ϶, ̻ ¸ ϸ鼭, ο · Ű ֽϴ.

ũƮ μ, ̸ 丮 غ δ , 丮 Ӽ ٿ δ , dir / ͸ ٿ δ ()Դϴ. $lfh = my_flock(basename => 'lockfileA'); () ȣϴ Ķ͸ ֽϴ. ,my_flock() () (Ÿ ƿ)ϸ() undef ݴϴ. ϰ 쿡 ϴ.

# ٴ(Ÿ ƿ )
# Keep calling my_flock() until it returns a defined lock (never gives up).
1 while (not defined($lfh = my_flock()));

, о鿩, װ ϴ Ÿ Ӵϴ.

  1. ٴ
  2. оδ
  3. ϽϿ Ѵ
  4. Ͻ Ͽ rename Ѵ
  5. ũ Ѵ
ž

ǥѴ

# Print the lines of the file $file in reverse order (last line first).
# The file is read backwards in $bufsize-byte chunks, so the whole file
# is never held in memory at once.

$bufsize = 1024;
$buf_tmp = '';
open(FILE, '<', $file) or die "Cannot open $file: $!";
binmode(FILE);
$size = (-s FILE) / $bufsize;
# Chunk count: int($size), plus one when a partial chunk is left over.
$pos += $size <=> ($pos = int($size));
while ($pos--) {
  seek(FILE, $bufsize * $pos, 0);
  read(FILE, $buf, $bufsize);
  # Re-attach the leading line fragment carried over from the chunk that
  # follows this one in the file.
  $buf .= $buf_tmp;
  # Split into lines, each keeping its terminator (CRLF, CR or LF).  The
  # first piece may be incomplete, so it is held back in $buf_tmp.
  ($buf_tmp, @lines) = $buf =~ /[^\x0D\x0A]*\x0D?\x0A?/g;
  # The pattern always yields one empty match at the very end of $buf.
  pop(@lines);
  foreach (reverse @lines) {
    print $_;
    # A final line without a terminator would otherwise run into the
    # line printed after it.
    print "\n" if $_ !~ /[\x0D\x0A]$/;
  }
}
close(FILE);
print $buf_tmp;

ũƮ $bufsize Ʈΰ о鿩 ǥϹǷ, ü о̴ ޸ ų ֽϴ.

$size Թ ִ -s () ׽Ʈ 1  ݴϴ. $pos$bufsize ׸ Ե˴ϴ. ؼ ڸ δ´ ּ. while $pos ȸ о鿩 óѴٰ ϴ ϰ ֽϴ.

$buf $bufsize Ʈΰ о Ϻΰ Ե˴ϴ. ǥѴ (), 켱 $buf ʿ䰡 ֽϴ. װ ϰ ִ $buf =~ /[^\x0D\x0A]*\x0D? \x0A? /g; κ ˴ϴ. ǥ, ڵ ̿ ڰ 0 ̻ ӵǾ, ڵ Ÿ ֽϴ. , ̰ 1 ִ Դϴ. ڵ ̿ ڰ 0 ̻ ࿡ մϴ. , ڵ κ ǥ \x0D? \x0A? () ڵ尡 \x0D\x0A \x0D \x0A ̶ , İ ̾ 쿡 մϴ. ̾߱ 𸨴ϴٸ, 1п ϴ ǥ, ڿ մϴ. ׸, װ ݵ $buf ķ Ű ڰ ƹ͵ · Ͼϴ. , ǹ ڿ ϱ ؼ, pop(@lines); ϰ ֽϴ.

$buf split Լ , split(/\x0D\x0A|\x0D|\x0A/, $buf); ׷ٸ ƴ 𸨴ϴٸ, $buf ־ 쿡 ˴ϴ. split Լ 3 μ ϸ(), split İ ڿ̾ 쿡 ڵ ˴ϴ. , ϴ ij "foo\nbar\n\n\n" split ϸ() ('foo', 'bar') ۿ ʱ , ('foo', 'bar', '', '') DZ⸦ ߴ ϴ.

ű⼭ ڿ ڵ Ű ʱ (), 3 μ split(/\x0D\x0A|\x0D|\x0A/, $buf, -1); () ϸ ƴұ ˷ ʽϴٸ, ̶̰ ߵ ʽϴ. , "foo\nbar\n" () split ϸ(), ̹ ('foo', 'bar', '') () ڵ ڿ ʰ ϴ. ű⼭, ̰Ϳ óϱ ؼ İ ڿ̾ 쿡 ϵ(), pop(@lines) if $lines[-1] eq ''; ϴ ֽϴ. ׷, ̰ ϴ Ϳ , Ȯ $bufsize ΰ ܶ İ ڵ忴 쿡 ʿ ϴ. Դٰ read ڿ $buf_tmp = "\n" if $buf_tmp eq ''; () ʿ䰡 ֽϴ. ̰ ǥ ϰ ˴ϴ. ٸ, ġũ ߴµ, ǥ () ä߽ϴ.

ž

ٱ⸸ ǥѴ

# Print at most the last $n lines of the file $file.
# The file is read backwards in $bufsize-byte chunks and reading stops as
# soon as enough lines have been collected.

$bufsize = 1024;
$buf_tmp = '';
@tail = ();
open(FILE, '<', $file) or die "Cannot open $file: $!";
binmode(FILE);
$size = (-s FILE) / $bufsize;
# Chunk count: int($size), plus one when a partial chunk is left over.
$pos += $size <=> ($pos = int($size));
while ($pos--) {
  seek(FILE, $bufsize * $pos, 0);
  read(FILE, $buf, $bufsize);
  # Re-attach the leading line fragment carried over from the chunk that
  # follows this one in the file.
  $buf .= $buf_tmp;
  # Split into lines, each keeping its terminator (CRLF, CR or LF).  The
  # first piece may be incomplete, so it is held back in $buf_tmp.
  ($buf_tmp, @lines) = $buf =~ /[^\x0D\x0A]*\x0D?\x0A?/g;
  # The pattern always yields one empty match at the very end of $buf.
  pop(@lines);
  unshift(@tail, @lines);
  last if @tail >= $n;
}
close(FILE);
unshift(@tail, $buf_tmp);
# Negative subscripts count from the end: keep only the last $n elements.
@tail = @tail[-$n .. -1] if @tail > $n;
foreach (@tail) {
  print $_;
}

ũƮ ⺻ ǥѴ ũƮ ϴ. ũƮ Ϳ Ͽ ּ. ̷μ $n ִ ٷ while () ϰ ִ ()Դϴ.

ǥϱ δ @tail ũ⸦ $n ũ $n 迭 ̽ ٽ ϰ ֽϴ. .. () , Ʈ κ Ʈ ݴϴ. , (-$n, -$n+1,..., -2, -1) ϴ Ʈ ˴ϴ. 迭 ÷ڰ 쿡 ڷκ Ұ ǹǷ, 迭 $n ̶ ˴ϴ.

ž

Ϸκ 1 Ѵ

# Pick one line of the file $file at random and print it.
# Only the current candidate line is kept in memory, and the number of
# lines does not have to be known in advance.

srand;
open(FILE, '<', $file) or die "Cannot open $file: $!";
# Line number $. replaces the current choice with probability 1/$. , which
# leaves every line equally likely once the whole file has been read.
rand($.) < 1 and $line = $_ while <FILE>;
close(FILE);
print $line;

ũƮ ü ޸𸮿 о ʴ´ Ƿ ޸ ų ֽϴ. , ̸ ˰ ʿ䵵 ϴ.

ü ؼ while Դϴٸ, 1ΰ о鿩 Ǵ κ while κԴϴ. κ 2 and () ϰ ֽϴ. and () 쿡 򰡵˴ϴ. , κ if Ͱ ǹ̰ ˴ϴ.

# Keep the current line in $line when rand($.) is below 1, i.e. with
# probability 1/$. for input line number $. .
if (rand($. ) < 1) {
  $line = $_;
}

Ư $. () о ȣ ݴϴ. , ϴ Ȯ 1/$. () ˴ϴ. , 1/1, 1/2, 1/3 Ȯ ˴ϴ. ̰ 1 ִ° ϴ Դϴ. ϰ , 3 ̾ 쿡, õǴ , Ǿ, δ ¥ ʿ䰡 ֽϴ. , Ȯ 1/1 * (1 - 1/2) * (1 - 1/3) = 1/3 Ǿ, иϰ Ȯ ˴ϴ. õǴ , ¥ Դϴ. Ƿ Ǹ ϴٰ ϴ ()? , Ȯ 1/2 * (1 - 1/3) = 1/3 Ǿ, Ȯ ˴ϴ.

ž

丮()  䱸Ѵ

# 丮 $dir   $size  䱸Ѵ

use File::Find;

# Walk $dir recursively; the callback sees each entry in $_ and adds the
# size of every plain file to $size.
find(
  sub {
    return unless -f $_;
    $size += -s $_;
  },
  $dir
);
print $size, "bytes\n";

ũƮ$dir հ踦 䱸ϰ ֽϴ. 丮 Ǵ 丮 ؼ ΰ óϰ ʹ 쿡ǥ File::Find find Լ ϴ մϴ. Լ 2 μ 丮 ؼ, Ǵ 丮 Ž, ߰ߵǾ Ǵ 丮 $_ 1 1 μ Լ մϴ. ȮϰԴ 1 μ ()Լ ۷ ݴϴ. ũƮ Լ ۷ ְ ֽϴ. ̰ ᵵ ϴ.

# 丮 $dir   $size  䱸Ѵ(˱ )

use File::Find;

# Callback for find(): add the size of the entry in $_ to $size when it is
# a plain file; anything else is ignored.
sub wanted {
  return unless -f $_;
  $size += -s $_;
}

find(\&wanted, $dir);
print $size, "bytes\n";

-s () ׽Ʈ ϳ  ݴϴ. -f () 丮 ȣ δ ˴ϴ. 켱 ƴϰ óϰ 쿡 finddepth Լ մϴ.

ž

±׸ Ѵ

$str EUC-JP ׷ٰ ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# Remove the tags from $str and store the result in $result.
# $tag_regex and $tag_regex_ must already be defined.

$text_regex = q{[^<]*};

$result = '';
while ($str =~ /($text_regex)($tag_regex)?/gso) {
  last if $1 eq '' and $2 eq '';
  $result .= $1;
  $tag_tmp = $2;
  if ($tag_tmp =~ /^<(XMP|PLAINTEXT|SCRIPT)(?![0-9A-Za-z])/i) {
    # Everything up to the matching end tag is literal text, not markup:
    # keep it with < and > escaped.  The /g makes this match continue
    # from where the loop's match stopped and moves on past the end tag.
    $str =~ /(.*?)(?:<\/$1(?![0-9A-Za-z])$tag_regex_|$)/gsi;
    ($text_tmp = $1) =~ s/</&lt;/g;
    $text_tmp =~ s/>/&gt;/g;
    $result .= $text_tmp;
  }
}

ũƮ ⺻ ڵ URI(URL) ũѴ ũƮ ϴ. ڼϰԴ ּ. $tag_regex $tag_regex_ ؼ HTML ± ǥ ũƮ ǥμ մϴ. , $str HTML ü ־ Ӵϴ. ǰ ʿ μ, XMP ± PLAINTEXT ± 쿡, װͱ ߿ ȿ ±װ ȿϰ Ǿ ɼ ִ Դϴ. ,XMP ± PLAINTEXT ± 쿡, < &lt; , > &gt; () ȯϰ ֽϴ. SCRIPT ± ؼ ϴ.

± < > ָ ±׸ ϴ ʴ찡 ֽϴ.

# Remove the tags from $str and store the result in $result (naive form:
# a tag is taken to run from "<" to the first ">" that follows it).

$result = $str;
$result =~ s/<[^>]*>//g;

üδ ֽϴ.

ũƮ ̷ 쿡 ߵǰ Ǿ ֽϴ. ٸ,HTML μ ùٸ ִ 츦 ֱ , < ϴ > ġ ϴ Ϳ ˴ϴ.

BR± Ư ±׸ ϰ 쿡, $tag_tmp = $2; ڿ, $tag_tmp $result ϵ ϸ ֽϴ.

  # Keep BR and A tags (opening or closing) instead of dropping them.
  $result .= $tag_tmp if $tag_tmp =~ /^<\/?(BR|A)(?![0-9A-Za-z])/i;

ݴ FONT ± IMG ± Ư ±׸ ϰ 쿡, $tag_tmp = $2; ڿ, $tag_tmp $result ϵ ϸ ֽϴ.

  # Keep every tag except FONT and IMG (opening or closing).
  $result .= $tag_tmp if $tag_tmp !~ /^<\/?(FONT|IMG)(?![0-9A-Za-z])/i;

HTML::TokeParser get_text ޼ҵ, Ǵ get_trimmed_text ޼ҵ, striphtml () ص ϴ.

ž

ڵ URI(URL) ũѴ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP () ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# Turn the URIs (URLs) in $str into links and store the result in $result.
# $tag_regex and $tag_regex_ must already be defined, as must
# $http_URL_regex, $ftp_URL_regex and $mail_regex.

$text_regex = q{[^<]*};

$result = '';  $skip = 0;
while ($str =~ /($text_regex)($tag_regex)?/gso) {
  last if $1 eq '' and $2 eq '';
  $text_tmp = $1;
  $tag_tmp = $2;
  if ($skip) {
    # Inside an existing <A>...</A>: copy through untouched.
    $result .= $text_tmp . $tag_tmp;
    $skip = 0 if $tag_tmp =~ /^<\/[aA](?![0-9A-Za-z])/;
  } else {
    # One combined pattern, so a URL is never re-linked by a later pass.
    # $2 is set only when the mail alternative matched.
    $text_tmp =~ s{($http_URL_regex|$ftp_URL_regex|($mail_regex))}
      {my($org, $mail) = ($1, $2);
       (my $tmp = $org) =~ s/"/&quot;/g;
       '<A HREF="' . ($mail ne '' ? 'mailto:' : '') . "$tmp\">$org</A>"}ego;
    $result .= $text_tmp . $tag_tmp;
    $skip = 1 if $tag_tmp =~ /^<[aA](?![0-9A-Za-z])/;
    if ($tag_tmp =~ /^<(XMP|PLAINTEXT|SCRIPT)(?![0-9A-Za-z])/i) {
      # Copy everything up to and including the matching end tag as is.
      # The /g makes this match continue from where the loop's match
      # stopped and moves on past the end tag.
      $str =~ /(.*?(?:<\/$1(?![0-9A-Za-z])$tag_regex_|$))/gsi;
      $result .= $1;
    }
  }
}

$http_URL_regex ؼ http URL ǥ, $ftp_URL_regex ؼ ftp URL ǥ, $mail_regex ؼ ּ ǥ ִ ũƮ ǥμ մϴ. , $tag_regex $tag_regex_ ؼ HTML ± ǥ ũƮ ǥμ մϴ. , $str HTML ü ־ Ӵϴ. ũƮ ׸ ʴ http URL ftp URL ּҿ ũմϴ.

ũƮ ϰ մϴ. $str ؼ, ؽƮ κа ± κ 1ΰ ã while ϴ. ± κ Ư ó ʿ ״Դϴ. $skip () ũϱ 1 ˴ϴ. ؽƮ κ Ư óϴ ״ մϴ. ݾ $skip 0 ǵϴ. ũ , ؽƮ κп http URL ftp URL Ǵ ּҸ ãƳ´ 쿡 ũմϴ.

, ± κ XMP ±, Ǵ, PLAINTEXT ± 쿡, ϴ ݰ ±ױ ŵ մϴ. ̶ ϴ ,while ִ ؽƮ κа ± κ DZ (), ݰ ± ŭ ָѴٰ ϴ Դϴ. ֳϸ, ̷ ± ȿ ٸ±װ ȿ() Ǿ, ״ ǥõDZ Դϴ. ݴ ϸ, ̷ ± ȿ ±׷ ±״ ƴϰ, ؽƮ Ǹ ϴ Դϴ. ٸ, κп http URL ftp URL, ּҰ ִ 쿡 ũ ġ ʽϴ. ƴٰ ص, װ ״ ǥõǾ ǹ̰ Դϴ. SCRIPT ± ؼ ϴ.

$str ġ ִ 2 g ٿ ִ Ϳ ָ ּ. g () ġĮ ϸ(), ȸ ġ ߴ ξ, κ ˻ ݴϴ. ũƮ ⺻ ؽƮ κа ± κ 1ΰ ã while ֽϴٸ,XMP ±, PLAINTEXT ±,SCRIPT ± ִ ó ʿ䰡 ֽϴ. ó while ƿ , κ ġ ޴´ ʿ䰡 ֽϴ. ̷ , $str ġ ξ g ٿ ֱ , 쵵 ħ κ ġ ִ Դϴ.

ġȯ ũϴ óԴϴٸ, ܼϰ 2 κ ˴ϴ.

    # Naive form: three independent passes, one per kind of address.
    $text_tmp =~ s/($http_URL_regex)/<A HREF="$1">$1<\/A>/go;
    $text_tmp =~ s/($ftp_URL_regex)/<A HREF="$1">$1<\/A>/go;
    $text_tmp =~ s/($mail_regex)/<A HREF="mailto:$1">$1<\/A>/go;

1° , ± () ٺ긣ũƮ ѷδ , ٺ긣ũƮ ϰ () ȴٰ ϴ Դϴ. ű⼭, ٺ긣ũƮ ѷδ κп ؼ, Ϳ Եȴ ٺ긣ũƮ &quot; () ȯѴٰ ϴ ó ʿϰ ˴ϴ.

2()° Դϴٸ, ġȯ ó http URL, ftp URL, ּ پ ִٰ ϴ Դϴ. ̰͵ ٸ ǥ ϴ κ ֽϴ. ü ϸ, ֽϴ.

http://www.din.or.jp/~ohzaki/?ftp://ftp.din.or.jp/+ohzaki@din.or.jp
ftp://ftp.din.or.jp/ohzaki@din.or.jp
"http://www.din.or.jp/~ohzaki/?ftp://ftp.din.or.jp/"@din.or.jp

κ http URL,ftp URL, ּ ǰ ֽϴ. ̰͵ ġȯ ó , ּ Ϻθ http URL μ ġȯ ųhttp URL Ϻθ ftp URL μ ġȯ ٶ ϴ Ͼ ϴ. ԵǴ 𸣱 , ġȯ ó ʷ ִ ƴմϴ. ེԵ, κ ٸ ǥ , ̷ ġȯ ó 1 ǥ , ġȯ ó ϴ , ũ ֽϴ.

ž

ǥ

# ݰ ̽
# Regular-expression fragments for Japanese text, given as byte patterns
# for EUC-JP and for Shift_JIS (the *_sjis variables).

# Half-width (ASCII) space
$space = '\x20';

# Full-width space
$Zspace = '(?:\xA1\xA1)'; # EUC-JP
$Zspace_sjis = '(?:\x81\x40)'; # SJIS

# Full-width digits [0-9]
$Zdigit = '(?:\xA3[\xB0-\xB9])'; # EUC-JP
$Zdigit_sjis = '(?:\x82[\x4F-\x58])'; # SJIS

# Full-width upper-case letters [A-Z]
$Zuletter = '(?:\xA3[\xC1-\xDA])'; # EUC-JP
$Zuletter_sjis = '(?:\x82[\x60-\x79])'; # SJIS

# Full-width lower-case letters [a-z]
$Zlletter = '(?:\xA3[\xE1-\xFA])'; # EUC-JP
$Zlletter_sjis = '(?:\x82[\x81-\x9A])'; # SJIS

# Full-width alphabet [A-Za-z]
$Zalphabet = '(?:\xA3[\xC1-\xDA\xE1-\xFA])'; # EUC-JP
$Zalphabet_sjis = '(?:\x82[\x60-\x79\x81-\x9A])'; # SJIS

# Full-width hiragana
$Zhiragana = '(?:\xA4[\xA1-\xF3])'; # EUC-JP
$Zhiragana_sjis = '(?:\x82[\x9F-\xF1])'; # SJIS

# Full-width hiragana (extended: plus four characters from row 1)
$ZhiraganaExt = '(?:\xA4[\xA1-\xF3]|\xA1[\xAB\xAC\xB5\xB6])'; # EUC-JP
$ZhiraganaExt_sjis = '(?:\x82[\x9F-\xF1]|\x81[\x4A\x4B\x54\x55])'; # SJIS

# Full-width katakana
$Zkatakana = '(?:\xA5[\xA1-\xF6])'; # EUC-JP
$Zkatakana_sjis = '(?:\x83[\x40-\x96])'; # SJIS

# Full-width katakana (extended: plus four characters from row 1)
$ZkatakanaExt = '(?:\xA5[\xA1-\xF6]|\xA1[\xA6\xBC\xB3\xB4])'; # EUC-JP
$ZkatakanaExt_sjis = '(?:\x83[\x40-\x96]|\x81[\x45\x5B\x52\x53])'; # SJIS

# Half-width katakana
$Hkatakana = '(?:\x8E[\xA6-\xDF])'; # EUC-JP
$Hkatakana_sjis = '[\xA6-\xDF]'; # SJIS

# One EUC-JP character
$ascii = '[\x00-\x7F]'; # 1-byte EUC-JP character
$twoBytes = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2-byte EUC-JP character
$threeBytes = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3-byte EUC-JP character
$character = "(?:$ascii|$twoBytes|$threeBytes)"; # EUC-JP character

# One EUC-JP character, strict (excludes the code points matched by
# $character_undef below, including all 3-byte characters)
$character_strict = '(?:[\x00-\x7F]|' # ASCII
  . '\x8E[\xA1-\xDF]|' # half-width katakana
  . '[\xA1\xB0-\xCE\xD0-\xF3][\xA1-\xFE]|' # rows 1,16-46,48-83
  . '\xA2[\xA1-\xAE\xBA-\xC1\xCA-\xD0\xDC-\xEA\xF2-\xF9\xFE]|' # row 2
  . '\xA3[\xB0-\xB9\xC1-\xDA\xE1-\xFA]|' # row 3
  . '\xA4[\xA1-\xF3]|' # row 4
  . '\xA5[\xA1-\xF6]|' # row 5
  . '\xA6[\xA1-\xB8\xC1-\xD8]|' # row 6
  . '\xA7[\xA1-\xC1\xD1-\xF1]|' # row 7
  . '\xA8[\xA1-\xC0]|' # row 8
  . '\xCF[\xA1-\xD3]|' # row 47
  . '\xF4[\xA1-\xA6])'; # row 84

# EUC-JP code points excluded from $character_strict (includes all
# 3-byte characters)
$character_undef = '(?:[\xA9-\xAF\xF5-\xFE][\xA1-\xFE]|' # rows 9-15,85-94
  . '\x8E[\xE0-\xFE]|' # half-width katakana area
  . '\xA2[\xAF-\xB9\xC2-\xC9\xD1-\xDB\xEB-\xF1\xFA-\xFD]|' # row 2
  . '\xA3[\xA1-\xAF\xBA-\xC0\xDB-\xE0\xFB-\xFE]|' # row 3
  . '\xA4[\xF4-\xFE]|' # row 4
  . '\xA5[\xF7-\xFE]|' # row 5
  . '\xA6[\xB9-\xC0\xD9-\xFE]|' # row 6
  . '\xA7[\xC2-\xD0\xF2-\xFE]|' # row 7
  . '\xA8[\xC1-\xFE]|' # row 8
  . '\xCF[\xD4-\xFE]|' # row 47
  . '\xF4[\xA7-\xFE]|' # row 84
  . '\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3-byte characters

# One SJIS character
$oneByte_sjis = '[\x00-\x7F\xA1-\xDF]'; # 1-byte SJIS character
$twoBytes_sjis =
  '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'; # 2-byte SJIS character
$character_sjis = "(?:$oneByte_sjis|$twoBytes_sjis)"; # SJIS character

# One SJIS character, strict (excludes the code points matched by
# $character_sjis_undef below)
$character_sjis_strict = '(?:[\x00-\x7F\xA1-\xDF]|' # ASCII, half-width katakana
  . '[\x89-\x97\x99-\x9F\xE0-\xE9][\x40-\x7E\x80-\xFC]|' # rows 17-46,49-82
  . '\x81[\x40-\x7E\x80-\xAC\xB8-\xBF\xC8-\xCE\xDA-\xE8\xF0-\xF7\xFC]|' # rows 1,2
  . '\x82[\x4F-\x58\x60-\x79\x81-\x9A\x9F-\xF1]|' # rows 3,4
  . '\x83[\x40-\x7E\x80-\x96\x9F-\xB6\xBF-\xD6]|' # rows 5,6
  . '\x84[\x40-\x60\x70-\x7E\x80-\x91\x9F-\xBE]|' # rows 7,8
  . '\x88[\x9F-\xFC]|' # rows 15,16
  . '\x98[\x40-\x72\x9F-\xFC]|' # rows 47,48
  . '\xEA[\x40-\x7E\x80-\xA4])'; # rows 83,84

# SJIS code points excluded from $character_sjis_strict
$character_sjis_undef =
  '(?:[\x85-\x87\xEB-\xFC][\x40-\x7E\x80-\xFC]|' # rows 9-14,85-120
  . '\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB]|' # rows 1,2
  . '\x82[\x40-\x4E\x59-\x5F\x7A-\x7E\x80\x9B-\x9E\xF2-\xFC]|' # rows 3,4
  . '\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC]|' # rows 5,6
  . '\x84[\x61-\x6F\x92-\x9E\xBF-\xFC]|' # rows 7,8
  . '\x88[\x40-\x7E\x80-\x9E]|' # rows 15,16
  . '\x98[\x73-\x7E\x80-\x9E]|' # rows 47,48
  . '\xEA[\xA5-\xFC])'; # rows 83,84

# i-mode pictographs (SJIS)
$iPictograph_base = '(?:\xF8[\x9F-\xFC]|' # basic pictographs (SJIS)
  . '\xF9[\x40-\x49\x50-\x52\x55-\x57\x5B-\x5E\x72-\x7E\x80-\xB0])';
$iPictograph_ext = '(?:\xF9[\xB1-\xFC])'; # extended pictographs (SJIS)
# Double-quoted so that the two parts are actually interpolated.
$iPictograph =
  "(?:$iPictograph_base|$iPictograph_ext)"; # i-mode pictographs (SJIS)

Ϻ ޿ ؼ Ϻ Ѵ .

ڿ ؼ ⿡ ʴ Ѵ. ֳϸ, ڴ vender ڵ ſ , ľϴ Ұϱ ̴. ũó ( ǿ vender ) ڿ شѴ.

ž

HTML ± ǥ

# Regular expression $tag_regex matching one HTML tag

# $tag_regex_: the part of an ordinary tag after its "<".  Quoted attribute
# values may contain ">"; the tag ends at ">", just before another "<",
# or at the very end of the string.
$tag_regex_ = q{[^"'<>]*(?:"[^"]*"[^"'<>]*|'[^']*'[^"'<>]*)*(?:>|(?=<)|$(?!\n))}; #'}}}}
# $comment_tag_regex: a "<!" declaration containing "--...--" comments.
$comment_tag_regex =
    '<!(?:--[^-]*-(?:[^-]+-)*?-(?:[^>-]*(?:-[^>-]+)*?)??)*(?:>|$(?!\n)|--.*$)';
$tag_regex = qq{$comment_tag_regex|<$tag_regex_};

ũƮ $comment_tag_regex ڸƮ ± ǥ, $tag_regex_ ڸƮ ± ̿ ± < ǥ ˴ϴ.

ʷ ± ǥ մϴ. ± ǥμ ʷ [^>]* Դϴ. ׷, ̰δ ٺ긣ũƮ ̸ũƮ ѷ ȿ > ־ 쿡 ˴ϴ. ű⼭, ٺ긣ũƮ ̸ũƮ մϴ.

ٺ긣ũƮ ѷο ִ κ ǥ "[^"]*" ֽϴ. ̸ũƮ ѷο ִ κп ؼ ϴ. ̰ ٺ긣ũƮ ̸ũƮ ѷο ִ ʿ > () ֽϴ. ̿ ٺ긣ũƮ ̸ũƮ ѷ κ ̹̾߸ [^>] ̴ϱ, ᱹ (? :[^>]|"[^"]*"|'[^']')* ׸ , ϸ() ׷ ʽϴ. [^>] ٺ긣ũƮ ̸ũƮ (), ó غ ٺ긣ũƮ ̸ũƮ ѷο ִ κ ǥ Ǵ ״ Ī Ǿ, ٺ긣ũƮ ̸ũƮ > ± ߸˾ ϴ.

̰ ȸϷ , (? :"[^"]*"|'[^']*'|[^>])* () ʷ ٺ긣ũƮ ̸ũƮ ѷο ִ  ϴ ֽϴ. ׷, ̰ иϰԴϴ. ֳϸ, ٺ긣ũƮ ̸ũƮ ѷ κ , 1 ٺ긣ũƮ ̸ũƮ Ī [^>] ʱ Դϴ. ű⼭ [^"'>] ϸ() ߵ˴ϴ.

# Tag body: any mix of plain characters and quoted strings, so a ">"
# inside quotes does not end the tag.
$tag_regex_ = q{(?:[^"'>]|"[^"]*"|'[^']*')*}; #'}}}

ʴ ±׸ մϴ. ʴ ±׶ <P<B> > () Ǿ ִ Դϴ. <P () ùٸ ±׷μ νϱ ؼ, ± [^>]* ƴϰ [^<>]* ؾ ϰ ˴ϴ. , ± Ĵ ݵ > ٰ , (? :>|(? =<)|$(?!\n)) ʿ䰡 ֽϴ. ̰ > ±ΰ, Ǵ, ڰ ± < , Ǵ, ij 츦 Ÿ ֽϴ. $(?!\n) ؼ ڼϰ մϴ. ᱹ, ̰ ϸ() ˴ϴ.

# Tag body that also copes with unclosed tags: it ends at ">", just before
# the next "<", or at the very end of the string.
$tag_regex_ = q{(?:[^"'<>]|"[^"]*"|'[^']*')*(?:>|(?=<)|$(?!\n))}; #'}}}

̰ Jeffrey E. F. Friedl Ѵ ؼ ǥ ( Mastering Regular Expressions ) μ ִ ӵ ũƮ ǥԴϴ. ġũ Ҵµ 1.5 ϴ.

ڸƮ ± ǥ մϴ. ڸƮ ±׿ ؼ, 켱 6ϰ龾 Ѵ SGML ּ ϵϴ õմϴ.

ڸƮ ±, , ּ --ڸƮ -- ׷ٰ ϴ ڸƮκ Ǿ ֽϴ. ڸƮ ±״ ڸƮ ڸƮ ڿ ڸ־ ϴ. , ڸƮ ̳ ڸƮ 0 ϴ. ٸ,<! () ڸƮȿ ڰ ִ 뼭 ʱ ,<! Ŀ ڸƮΰ ݰ ȣ > ۿ ͼ ǰ ˴ϴ. ̻κ, ڸƮ ± ǥ ˴ϴ.

#  ڸƮ ±  ǥ $comment_tag_regex

# Strict form: "<!", then zero or more "--...--" comments each optionally
# followed by whitespace, then ">".
$comment_tag_regex = q{<!(?:--(?:(?!--).)*--\s*)*>};

ڸƮ ± ǥ ʷ, ʴ ڸƮ ±׿ ڸƮ ڷ ̿ ڰ ־ ڸƮ ±׿ 쿡 ǥ ũƮ ˴ϴ.

(? :>|$(?!\n)|--. *$) , ڸƮ ±װ ݰ ־ , ڸƮ ڿ > ݾ , ڸƮ -- ڸƮ ӵǰ ִ 츦 Ÿ ֽϴ. $(?!\n) ׷, ܼ $ ̶ ƴұ ǹ Ǵ ϰ մϴٸ,$(?!\n) $ ǹ̰ ٸϴ. , $str = "test\n"; , m/^test$/ () մϴٸ,m/^test$(?!\n)/ ʽϴ. ֳϸ,$ ij ־ 쿡, ̶ ϴκԴϴ. , 'test' ־ , "test\n" ϱ⸦ ٶ ʴٰ , ܼ $ ׷ Դϴ. ڸƮ ± ǥ "<! \n" () 쿡 ޾Ƽ ϹǷ ̷ ǥ Ǿ ֽϴ. perl5. 005 Ķ $(?!\n) \z () ֽϴ. \z $ ̳ \Z () ޸ ǹ̷ ij մϴ.

ڸƮ ± ǥ ᵵ ϴ. Ϸ 켱 ǥ ʷ ϴ 𸨴ϴ.

# ڸƮ ±  ǥ(ʴ)

# Lenient, easy-to-read form: also accepts other characters after a
# comment, a missing ">" at the end of the string, and an unterminated
# comment.
$comment_tag_regex = '<!(?:--(?:(?!--).)*--(?:(?!--)[^>])*)*(?:>|$(?!\n)|--.*$)';

ǥ, ڸƮ Ÿ ǥμ (? :(?!--). )* () ϰ ֽϴ. ̰ ǹ̴, -- ʴ ΰ 1 ݺ̶ ϴ Դϴ. , - ܵ Ÿ 쿡 , -- () Ÿ - () ȵȴٰ ϴ ˴ϴ. ̰ ڸƮ 뿡 -- Ÿ ʴ ˴ϴ. ڸƮ Ÿ ǥμ ̰ ùٸϴٸ, 1 -- ƴ üũϰ Ƿ ̴δ ӵ Դϴ.

ű⼭ ± ̿ϴ մϴ. 켱,-- ʴ ΰ 1 ݺ Ÿ (? :(?!--). )* () ٸ ٽ ǥմϴ. ǥ -- Ե ʴ κ̶ ϴ ̹Ƿ, 켱,-̿ ڶ ٷ ȴٰ մϴ. - Դٰ ص ڰ -̿ ̸ 쵵 ϴ. ,(? :(?!--). )* () (? :[^-]|-[^-])* () ֽϴ. ̰Ϳ ؼ ̿ϸ(), [^-]*(? :-[^-][^-]*)* Ǿ, ᱹ [^-]*(? :-[^-]+)* ˴ϴ.

̰ ڸƮ κ ǥ --[^-]*(? :-[^-]+)*-- Ǿϴ. ġũ ߴµ 2 ϴ. ׷, ũƮʹ ̰ ϴ. ڸƮ κ ǥ ֽϴ. װ - , װ ڸƮ ΰ, ڸƮ Ḧ Ÿ -- 1 ΰ 𸨴ϴٸ, ǥ ɼ ִ Ұ 2 Ǿ ִٰ ϴ Դϴ. ,(? :-[^-]+)* (? : - 𸣰, (? :-[^-]+)* - 𸨴ϴ. ̷ Ʈ ߻ÿ δ ϰ ˴ϴ. ű⼭, [^-]*(? :-[^-]+)*-- () , ϸ() [^-]*-(? :[^-]+-)*- ˴ϴ. ̰ - ϴ ǥ κ [^-]* - 1 ˴ϴ.

ũƮ ϴٸ, 1̰ ϴ. װ (? :[^-]+-)* () (? :[^-]+-)*? , , * () *? Դϴ. Ϲ * () *? () ٲٴ ϴ ͵ ٲ ϴ. ׷, ̹ * ׷ *? ׷ ݵ ˴ϴ. ݵ Ǵ ְ Ƿ, ӵ () մϴ. Ϲ ڸƮ ±׶ <-- ̰ ڸƮ ±Դϴ --> ϴ κ. , ڸƮ ± μ - () ϰ ִ 󵵴, 󵵺 ٰ ϴ Դϴ. , ڸƮ ± 뿡 - ϰ ־ (? :[^-]+-) κ ϰ ˴ϴ. ׷, δ ̱ , (? :[^-]+-) κ üũϴ ˴ϴ. ű⼭,* *? () ϴ ֽϴ.

(? :(?!--)[^>])* κп մϴ. ⵵ ڸƮ ± κа 켱 ̿ (? :[^>-]*(? :-[^>-]+)* () մϴ. , * () *? () ֱ , (? :[^>-]*(? :-[^>-]+)*? () ϴ () ϴ.

ũƮ ü (? : regex)?? () ?? () · ϰ ֽϴ. ̰, ڸƮ ± ޸, Ϲ ڸƮ Ḧ Ÿ -- ڿ ΰ ڰ  Ŀ > ִ 󵵰 ٰ DZ ()Դϴ. ٲپ ϸ(), (? :[^>-]*(? :-[^>-]+)*? ϴ , , Ű ϸ() ٰ DZ (), κ ǥ ü ?? () ٿ, üũŰ ʰ ϰ ֽϴ.

ž

URI(URL) ǥ

# Check whether $uri is a valid URI.
# The pattern is assembled from one variable per grammar rule; the final
# rule is $URI_reference.

$digit = q{[0-9]};
$upalpha = q{[A-Z]};
$lowalpha = q{[a-z]};
$alpha = qq{(?:$lowalpha|$upalpha)};
$alphanum = qq{(?:$alpha|$digit)};
$hex = qq{(?:$digit|[A-Fa-f])};
$escaped = qq{%$hex$hex};
$mark = q{[-_.!~*'()]};
$unreserved = qq{(?:$alphanum|$mark)};
$reserved = q{[;/?:@&=+$,]};
$uric = qq{(?:$reserved|$unreserved|$escaped)};
$fragment = qq{$uric*};
$query = qq{$uric*};
$pchar = qq{(?:$unreserved|$escaped|} . q{[:@&=+$,])};
$param = qq{$pchar*};
$segment = qq{$pchar*(?:;$param)*};
$path_segments = qq{$segment(?:/$segment)*};
$abs_path = qq{/$path_segments};
$uric_no_slash = qq{(?:$unreserved|$escaped|} . q{[;?:@&=+$,])};
$opaque_part = qq{$uric_no_slash$uric*};
$path = qq{(?:$abs_path|$opaque_part)?};
$port = qq{$digit*};
$IPv4address = qq{$digit+\\.$digit+\\.$digit+\\.$digit+};
$toplabel = qq{(?:$alpha|$alpha(?:$alphanum|-)*$alphanum)};
$domainlabel = qq{(?:$alphanum|$alphanum(?:$alphanum|-)*$alphanum)};
$hostname = qq{(?:$domainlabel\\.)*$toplabel\\.?};
$host = qq{(?:$hostname|$IPv4address)};
$hostport = qq{$host(?::$port)?};
$userinfo = qq{(?:$unreserved|$escaped|} . q{[;:&=+$,])*};
$server = qq{(?:(?:$userinfo\@)?$hostport)?};
$reg_name = qq{(?:$unreserved|$escaped|} . q{[$,;:@&=+])+};
$authority = qq{(?:$server|$reg_name)};
$scheme = qq{$alpha(?:$alpha|$digit|[-+.])*};
$rel_segment = qq{(?:$unreserved|$escaped|} . q{[;@&=+$,])+};
$rel_path = qq{$rel_segment(?:$abs_path)?};
$net_path = qq{//$authority(?:$abs_path)?};
$hier_part = qq{(?:$net_path|$abs_path)(?:\\?$query)?};
$relativeURI = qq{(?:$net_path|$abs_path|$rel_path)(?:\\?$query)?};
$absoluteURI = qq{$scheme:(?:$hier_part|$opaque_part)};
$URI_reference = qq{(?:$absoluteURI|$relativeURI)?(?:#$fragment)?};

$pattern = $URI_reference;

print "ok\n" if $uri =~ /^$pattern$/o;

URI ؼ RFC 2396 ( Ϻ ) ֽϴ. װ ϰ ǥ ߴ ũƮԴϴ. ݺ 䱸 URI References ǥ Ǿϴ.

(? :(? :[a-z]|[A-Z])(? :(? :[a-z]|[A-Z])|[0-9]|[-+. ])*:(? :(? ://(? :(? :(
? :(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])
(? :[0-9]|[A-Fa-f])|[;:&=+$,])*@)? (? :(? :(? :(? :(? :[a-z]|[A-Z])|[0-9]
)|(? :(? :[a-z]|[A-Z])|[0-9])(? :(? :(? :[a-z]|[A-Z])|[0-9])|-)*(? :(? :[
a-z]|[A-Z])|[0-9]))\. )*(? :(? :[a-z]|[A-Z])|(? :[a-z]|[A-Z])(? :(? :(? :
[a-z]|[A-Z])|[0-9])|-)*(? :(? :[a-z]|[A-Z])|[0-9]))\.?|[0-9]+\. [0-9]
+\. [0-9]+\. [0-9]+)(? ::[0-9]*)? )? |(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[
-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[$, ;:@&=+])+)(? :
/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(
? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[
-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*(? :
/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(
? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[
-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*)*)
? |/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f]
)(? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])
|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*(
? :/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f]
)(? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])
|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*)
*)(? :\? (? :[;/? :@&=+$,]|(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%
(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f]))*)? |(? :(? :(? :(? :[a-z]|[A-Z])|
[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[;? :@&=+
$,])(? :[;/? :@&=+$,]|(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :
[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f]))*)|(? ://(? :(? :(? :(? :(? :(? :(? :[a-
z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f]
)|[;:&=+$,])*@)? (? :(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|(? :(? :[a-z]|[A-
Z])|[0-9])(? :(? :(? :[a-z]|[A-Z])|[0-9])|-)*(? :(? :[a-z]|[A-Z])|[0-9]
))\. )*(? :(? :[a-z]|[A-Z])|(? :[a-z]|[A-Z])(? :(? :(? :[a-z]|[A-Z])|[0-9
])|-)*(? :(? :[a-z]|[A-Z])|[0-9]))\.?|[0-9]+\. [0-9]+\. [0-9]+\. [0-9]+
)(? ::[0-9]*)? )? |(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[
0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[$, ;:@&=+])+)(? :/(? :(? :(? :(? :[a-z
]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])
|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[
0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*(? :/(? :(? :(? :(? :[a-z
]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])
|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[
0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*)*)? |/(? :(? :(? :(? :[a
-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f
])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(?
:[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*(? :/(? :(? :(? :(? :[a
-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f
])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(?
:[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*)*)*|(? :(? :(? :(? :[a-
z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f]
)|[;@&=+$,])+(? :/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :
[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]
|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|
[:@&=+$,])*)*(? :/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :
[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]
|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|
[:@&=+$,])*)*)*)? )(? :\? (? :[;/? :@&=+$,]|(? :(? :(? :[a-z]|[A-Z])|[0-9]
)|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f]))*)? )? (? :#(? :[
;/? :@&=+$,]|(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A
-Fa-f])(? :[0-9]|[A-Fa-f]))*)?

ǥ ʹ ʹ Ϲ̾, κ Է¿ ؼ ϴ. RFC 2396 URI Ϲ ߴ ̹Ƿ, ǥ ϴ ٰ ص Դϴ.

ž

http URL ǥ

# Check whether $http is a valid http URL.
# The pattern is assembled from one variable per grammar rule; the final
# rule is $http_URL.

$digit = q{[0-9]};
$upalpha = q{[A-Z]};
$lowalpha = q{[a-z]};
$alpha = qq{(?:$lowalpha|$upalpha)};
$alphanum = qq{(?:$alpha|$digit)};
$hex = qq{(?:$digit|[A-Fa-f])};
$escaped = qq{%$hex$hex};
$mark = q{[-_.!~*'()]};
$unreserved = qq{(?:$alphanum|$mark)};
$reserved = q{[;/?:@&=+$,]};
$uric = qq{(?:$reserved|$unreserved|$escaped)};
$query = qq{$uric*};
$pchar = qq{(?:$unreserved|$escaped|} . q{[:@&=+$,])};
$param = qq{$pchar*};
$segment = qq{$pchar*(?:;$param)*};
$path_segments = qq{$segment(?:/$segment)*};
$abs_path = qq{/$path_segments};
$port = qq{$digit*};
$IPv4address = qq{$digit+\\.$digit+\\.$digit+\\.$digit+};
$toplabel = qq{(?:$alpha|$alpha(?:$alphanum|-)*$alphanum)};
$domainlabel = qq{(?:$alphanum|$alphanum(?:$alphanum|-)*$alphanum)};
$hostname = qq{(?:$domainlabel\\.)*$toplabel\\.?};
$host = qq{(?:$hostname|$IPv4address)};
$http_URL = qq{http://$host(?::$port)?(?:$abs_path(?:\\?$query)?)?};

$pattern = $http_URL;

print "ok\n" if $http =~ /^$pattern$/;

http URL ؼ RFC 2616 3.2. 2 http URL ֽϴ. ũƮ, URI(URL) ǥ URI(URL) ǥ ũƮ , http URL ǥ Դϴ. ũƮκ 䱸 http URL ǥ Ǿϴ.

http://(? :(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|(? :(? :[a-z]|[A-Z])|[0-9]
)(? :(? :(? :[a-z]|[A-Z])|[0-9])|-)*(? :(? :[a-z]|[A-Z])|[0-9]))\. )*(? :
(? :[a-z]|[A-Z])|(? :[a-z]|[A-Z])(? :(? :(? :[a-z]|[A-Z])|[0-9])|-)*(? :
(? :[a-z]|[A-Z])|[0-9]))\.?|[0-9]+\. [0-9]+\. [0-9]+\. [0-9]+)(? ::[0-9
]*)? (? :/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-
Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[
0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,]
)*)*(? :/(? :(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()])|%(? :[0-9]|[A-
Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,])*(? :;(? :(? :(? :(? :[a-z]|[A-Z])|[
0-9])|[-_.!~*'()])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f])|[:@&=+$,]
)*)*)*(? :\? (? :[;/? :@&=+$,]|(? :(? :(? :[a-z]|[A-Z])|[0-9])|[-_.!~*'()
])|%(? :[0-9]|[A-Fa-f])(? :[0-9]|[A-Fa-f]))*)? )?

http URL ǥ, Ŭ ʱ дϴ. ű⼭, Ŭ ϵ() Ͽ Ϻ մϴ.

# Check whether $http is a valid http URL (character-class version).
# Replacement definitions that fold alternations of single characters
# into one character class; $escaped must already be defined.

$alpha = q{[a-zA-Z]};
$alphanum = q{[a-zA-Z0-9]};
$hex = q{[0-9A-Fa-f]};
$uric = q{(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]} . qq{|$escaped)};
$pchar = q{(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]} . qq{|$escaped)};
$toplabel = qq{(?:$alpha|$alpha} . q{[-a-zA-Z0-9]*} . qq{$alphanum)};
$domainlabel = qq{(?:$alphanum|$alphanum} . q{[-a-zA-Z0-9]*} . qq{$alphanum)};

ũƮκ 䱸 http URL ǥ Ǿϴ.

http://(? :(? :(? :[a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9])\.
)*(? :[a-zA-Z]|[a-zA-Z][-a-zA-Z0-9]*[a-zA-Z0-9])\.?|[0-9]+\. [0-9]+\
. [0-9]+\. [0-9]+)(? ::[0-9]*)? (? :/(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0
-9A-Fa-f][0-9A-Fa-f])*(? :;(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa
-f][0-9A-Fa-f])*)*(? :/(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][
0-9A-Fa-f])*(? :;(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-F
a-f])*)*)*(? :\? (? :[-_.!~*'() a-zA-Z0-9;/? :@&=+$,]|%[0-9A-Fa-f][0-9A
-Fa-f])*)? )?

ǥ ߽ϴٸ,RFC 2616 3.2. 2 http URL ֽϴ. RFC 2616 HTTP ݿ ־3.2. 2 http URL ִ http URL , HTTP () ̾߱Ⱑ ˴ϴ. Ϲ, HTML ũ Ǵ , ϰ HTTP () Ǵ http URL ƴϰ, scheme http URI References Դϴ.

http://user:passwd@www.din.or.jp/~ohzaki/perl.htm#URI () URI References Դϴٸ,user:passwd@ κ, , userinfo ,#URI κ, , Fragment Identifier HTTP () ȴ http URL μ ˴ϴ. ׷, HTML ũμ ϴ. ֳϸ, Ŭ̾Ʈ() HTTP ݷ Ϸ װ͵ ϰ ֱ Դϴ. Դϴٸ, RFC 2396 ( Ϻ ) 4 Fragment Identifier URI Ϻο ̶ ֽϴ. Fragment Identifier user agent ؼǴ ΰ մϴ.

, scheme http URI References մϴ. ű⼭ ٽ URI(URL) ǥ URI(URL) ǥ ũƮ ϴ. , HTTP () Ǵ http URL () ϴµ ʿ ݵ ϰ , ̿ܿ Ȳ ϰ ־ ٰ մϴ. ʿ , host, port, abs_path, query Դϴ. , scheme 翬 http Դϴٸ, , Secure Hyper Text Tranasfer Protocol(S-HTTP) Ҹ ϴ shttp: Secure Sockets Layer(SSL) ׷ٰ ϴ ϴ https: ϵ() Ӵϴ. , Ͽ Ϻθ ϰ Ǿϴ.

# Replacement rules narrowing URI references to the http family of
# schemes.  $userinfo, $hostport, $net_path, $query and $fragment must
# already be defined.
$server = qq{(?:$userinfo\@)?$hostport};
$authority = qq{$server};
$scheme = q{(?:https?|shttp)};
$hier_part = qq{$net_path(?:\\?$query)?};
$absoluteURI = qq{$scheme:$hier_part};
$URI_reference = qq{$absoluteURI(?:#$fragment)?};

̰Ϳ Ŭ ϴ μ Ͽ Ϻ () ߽ϴ.

# Replacement definitions that fold alternations of single characters
# into one character class; $escaped must already be defined.
$alpha = q{[a-zA-Z]};
$alphanum = q{[a-zA-Z0-9]};
$hex = q{[0-9A-Fa-f]};
$unreserved = q{[-_.!~*'()a-zA-Z0-9]};
$uric = q{(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]} . qq{|$escaped)};
$pchar = q{(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]} . qq{|$escaped)};
$toplabel = qq{(?:$alpha|$alpha} . q{[-a-zA-Z0-9]*} . qq{$alphanum)};
$domainlabel = qq{(?:$alphanum|$alphanum} . q{[-a-zA-Z0-9]*} . qq{$alphanum)};
$userinfo = q{(?:[-_.!~*'()a-zA-Z0-9;:&=+$,]|} . qq{$escaped)*};

̿ 䱸 ǥ Ǿϴ.

(? :https? |shttp)://(? :(? :[-_.!~*'() a-zA-Z0-9;:&=+$,]|%[0-9A-Fa-f][
0-9A-Fa-f])*@)? (? :(? :(? :[a-zA-Z0-9]|[a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-
Z0-9])\. )*(? :[a-zA-Z]|[a-zA-Z][-a-zA-Z0-9]*[a-zA-Z0-9])\.?|[0-9]+\
. [0-9]+\. [0-9]+\. [0-9]+)(? ::[0-9]*)? (? :/(? :[-_.!~*'() a-zA-Z0-9:@&=
+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*(? :;(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%
[0-9A-Fa-f][0-9A-Fa-f])*)*(? :/(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9
A-Fa-f][0-9A-Fa-f])*(? :;(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f
][0-9A-Fa-f])*)*)*)? (? :\? (? :[-_.!~*'() a-zA-Z0-9;/? :@&=+$,]|%[0-9A-
Fa-f][0-9A-Fa-f])*)? (? :#(? :[-_.!~*'() a-zA-Z0-9;/? :@&=+$,]|%[0-9A-F
a-f][0-9A-Fa-f])*)?

ǥ ϸ, $http , scheme http URI References ȭ ֽϴ. ׷ , ij κ http URL ϴ ǥ ص ߵ ʽϴ. , ũƮ ϸ() ʴ дϴ.

# Extract the http URI References contained in $str and print one per line.
# $URI_reference must already be defined.

$str = "  URI  http://www.din.or.jp/~ohzaki/perl.htm Դϴ ";

$pattern = $URI_reference;

while ($str =~ /($pattern)/g) {
  print $1, "\n";
}

 (п)
http://www.din.or.j

̷ Ǿ ȴ. װ Perl ġ ڵ ġ NFAs(Nondeterministic Finite Automata) ̱ Դϴ. ũƮ .

# Demonstration of how alternation order affects what Perl's regex engine
# matches: each pattern is applied to $str with /g and the pieces it
# matched are printed joined by "/".
print " 1  or ڷ ۵Ǿ ڳ ҹڰ ӵǴ  \n";
$str = '123abc';
@patterns = ('(?:\d|\d[0-9a-z]+)', '(?:\d[0-9a-z]*)');
foreach $pattern (@patterns) {
  print "  ij  $str   $pattern  ";
  print ' ' . join('/', $str =~ /$pattern/g) . "\n";
}
print "\n 1  or ʰ ڳ ҹڷ,  ҹ  \n";
$str = '1a';
@patterns = ('(?:\d|[\da-z][a-z])', '(?:[\da-z][a-z]|\d)');
foreach $pattern (@patterns) {
  print "  ij  $str   $pattern  ";
  print ' ' . join('/', $str =~ /$pattern/g) . "\n";
}

 
 1  or ʰ ڷ,   ڳ ҹڰ ӵǴ 
  ij  123abc   (?:\d|\d[0-9a-z]+)  1/2/3
  ij  123abc   (?:\d[0-9a-z]*)  123abc

 1  or ʰ ڳ ҹڷ,  ҹ 
  ij  1a   (?:\d|[\da-z][a-z])  1
  ij  1a   (?:[\da-z][a-z]|\d)  1a

2() , ̳ ǥ ij Ϻ ۿ ϰ ʴ ȴٰ մϴ. ̿ Perl ġ ij ο ų ִ 쿡, ʷ ߰ߵ ġ ϴ. ׷ () ǥ ij ü ų ־.

1° , (? :regex1|regex1regex2+) ׷ٰ ϴ regex1regex2* ϴ ¿ , Ÿ ʰ ϰ ֽϴ. ̿ ϴ , ų ־ , κ 쿡 Ʈ Ƿ ȿ ˴ϴ. ̰Ͱ , Ͽ Ϻ մϴ.

# Factored form "first (rest)?" in place of "first|first rest", so the
# longer alternative is no longer hidden behind the shorter one.
# $alpha and $alphanum must already be defined.
$toplabel = qq{$alpha(?:} . q{[-a-zA-Z0-9]*} . qq{$alphanum)?};
$domainlabel = qq{$alphanum(?:} . q{[-a-zA-Z0-9]*} . qq{$alphanum)?};

2° , (? :regex1|regex2) ׷ٰ ϴ ,regex1 regex2 Ϻο 쿡, ɼ̴. regex2 () ϴ regex1 õǾ ȴ () ij Ϻο Ƚϴ. ݴ , (? :regex2|regex1) ¿ ϴ , ̷ ¸ ֽϴ. ɼ ִ κ̶ ϸ(), host ǥ hostname IPv4address κ ˴ϴ. ֳϸ, IPv4address ǥ hostname Ϻο ɼ ֱ Դϴ. , 127.0. 0.1.www.din.or.jp ϴ host ־ , IPv4address () 127.0. 0.1 κп ϴ. , ʺ host ǥ hostname () Ű Ǿ ֱ , Ư ʿ ˴ϴ.

,pseudohttp://foo/bar.htm () HTTP ƴ scheme ߺ , Ͽ մϴ.

# Prefix the pattern with a word boundary so that a match cannot begin in
# the middle of a longer word such as "pseudohttp".
$http_URL_regex = join('', q{\b}, $URI_reference);

̻ ũƮ Ͽ Ǿϴ.

# Regular expression $http_URL_regex matching an http URL.
# Assembled from one variable per grammar rule; the schemes accepted are
# http, https and shttp.

$digit = q{[0-9]};
$alpha = q{[a-zA-Z]};
$alphanum = q{[a-zA-Z0-9]};
$hex = q{[0-9A-Fa-f]};
$escaped = qq{%$hex$hex};
$uric = q{(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]} . qq{|$escaped)};
$fragment = qq{$uric*};
$query = qq{$uric*};
$pchar = q{(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]} . qq{|$escaped)};
$param = qq{$pchar*};
$segment = qq{$pchar*(?:;$param)*};
$path_segments = qq{$segment(?:/$segment)*};
$abs_path = qq{/$path_segments};
$port = qq{$digit*};
$IPv4address = qq{$digit+\\.$digit+\\.$digit+\\.$digit+};
$toplabel = qq{$alpha(?:} . q{[-a-zA-Z0-9]*} . qq{$alphanum)?};
$domainlabel = qq{$alphanum(?:} . q{[-a-zA-Z0-9]*} . qq{$alphanum)?};
$hostname = qq{(?:$domainlabel\\.)*$toplabel\\.?};
$host = qq{(?:$hostname|$IPv4address)};
$hostport = qq{$host(?::$port)?};
$userinfo = q{(?:[-_.!~*'()a-zA-Z0-9;:&=+$,]|} . qq{$escaped)*};
$server = qq{(?:$userinfo\@)?$hostport};
$authority = qq{$server};
$scheme = q{(?:https?|shttp)};
$net_path = qq{//$authority(?:$abs_path)?};
$hier_part = qq{$net_path(?:\\?$query)?};
$absoluteURI = qq{$scheme:$hier_part};
$URI_reference = qq{$absoluteURI(?:#$fragment)?};
# \b keeps a match from starting inside a longer word such as "pseudohttp".
$http_URL_regex = q{\b} . $URI_reference;

ũƮκ 䱸 http URL ǥ Ǿϴ.

\b(? :https? |shttp)://(? :(? :[-_.!~*'() a-zA-Z0-9;:&=+$,]|%[0-9A-Fa-f
][0-9A-Fa-f])*@)? (? :(? :[a-zA-Z0-9](? :[-a-zA-Z0-9]*[a-zA-Z0-9])? \. )
*[a-zA-Z](? :[-a-zA-Z0-9]*[a-zA-Z0-9])? \.?|[0-9]+\. [0-9]+\. [0-9]+\.
[0-9]+)(? ::[0-9]*)? (? :/(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f]
[0-9A-Fa-f])*(? :;(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-
Fa-f])*)*(? :/(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f
])*(? :;(? :[-_.!~*'() a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*)*)
*)? (? :\? (? :[-_.!~*'() a-zA-Z0-9;/? :@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])
*)? (? :#(? :[-_.!~*'() a-zA-Z0-9;/? :@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*
)?

ǥ ϸ,http URL ߵǵ() ˴ϴ. ϰ ̰ ϴ ũƮ ˴ϴ.

# The complete http URL pattern written out as one literal, split across
# several q{} pieces only to keep the source lines short.
$http_URL_regex =
q{\b(?:https?|shttp)://(?:(?:[-_.!~*'()a-zA-Z0-9;:&=+$,]|%[0-9A-Fa-f} .
q{][0-9A-Fa-f])*@)?(?:(?:[a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?\.)} .
q{*[a-zA-Z](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?\.?|[0-9]+\.[0-9]+\.[0-9]+\.} .
q{[0-9]+)(?::[0-9]*)?(?:/(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f]} .
q{[0-9A-Fa-f])*(?:;(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-} .
q{Fa-f])*)*(?:/(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f} .
q{])*(?:;(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*)*)} .
q{*)?(?:\?(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])} .
q{*)?(?:#(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*} .
q{)?};

׷, Խϴٸ, Ȯϰ ǥ , ϰ ƶ ϴ http URL ǥ ϰ ˴ϴ.

s?https?://[-_.!~*'()a-zA-Z0-9;/?:@&=+$,%#]+

ǥ ϴ ϴ ϴٸ, ǥμ ̿ϴ ʿ䰡 ֽϴ.

# Extract the http URLs from the text $text and store them in @http.
# Inside a m/.../ literal, "/" has to be written "\/", and "$" and "@" are
# escaped so they are not taken as the start of a variable.

@http = $text =~ /s?https?:\/\/[-_.!~*'()a-zA-Z0-9;\/?:\@&=+\$,%#]+/g;

/ \/ () Ǿ ִ ٰ մϴ. Ư Ǵ ,$ ̶ @ κԴϴ. ̰͵ ״δ Į 迭 μ ٷ Ǿ ϴ. ű⼭ 2() ؼ \$ \@ () ʿ䰡 ֽϴ. , 2() \ ̰ ذ ־  Ǵ? ,$ ؼƯ $, ()μ ڿ Ǿ ϴ. @ ؼ @& ׸ ۵Ǵ 迭 ʱ , 迭 μ ٷ ʰ ״ ˴ϴ.

ž

ftp URL ǥ

# Build $ftp_URL_regex, a regular expression (as a string) that matches
# an ftp URL, from small named fragments.
# q{} is used for fragments containing "$," so that it is not
# interpolated; qq{} is used where other fragments are spliced in.
# No fragment may contain whitespace: "(? :" is not a valid regex
# construct, and a space inside a character class would become a
# matchable character.

$digit = q{[0-9]};
$alpha = q{[a-zA-Z]};
$alphanum = q{[a-zA-Z0-9]};
$hex = q{[0-9A-Fa-f]};
$escaped = qq{%$hex$hex};
$uric = q{(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]} . qq{|$escaped)};
$fragment = qq{$uric*};
$query = qq{$uric*};
$pchar = q{(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]} . qq{|$escaped)};
$segment = qq{$pchar*};
$ftptype = q{[AIDaid]};
# Path segments followed by an optional ";type=X" suffix.
$path_segments = qq{$segment(?:/$segment)*(?:;type=$ftptype)?};
$abs_path = qq{/$path_segments};
$port = qq{$digit*};
$IPv4address = qq{$digit+\\.$digit+\\.$digit+\\.$digit+};
$toplabel = qq{$alpha(?:} . q{[-a-zA-Z0-9]*} . qq{$alphanum)?};
$domainlabel = qq{$alphanum(?:} . q{[-a-zA-Z0-9]*} . qq{$alphanum)?};
$hostname = qq{(?:$domainlabel\\.)*$toplabel\\.?};
$host = qq{(?:$hostname|$IPv4address)};
$hostport = qq{$host(?::$port)?};
# user and password exclude ":" so it can act as their separator.
$user = q{(?:[-_.!~*'()a-zA-Z0-9;&=+$,]|} . qq{$escaped)*};
$password = $user;
$userinfo = qq{$user(?::$password)?};
$server = qq{(?:$userinfo\@)?$hostport};
$authority = qq{$server};
$scheme = q{ftp};
$net_path = qq{//$authority(?:$abs_path)?};
$hier_part = qq{$net_path(?:\\?$query)?};
$absoluteURI = qq{$scheme:$hier_part};
$URI_reference = qq{$absoluteURI(?:#$fragment)?};
$ftp_URL_regex = q{\b} . $URI_reference;

ftp URL ؼ RFC 1738 ֽϴ. ٸ, RFC 1738 RFC 2396 ( Ϻ ) ŵǰ ֽϴ. ł ִٰ ص RFC 2396 URI Ϲ nj XX Ƿ, ftp URL ǿ 낳 ִ κ ϴ. ű⼭,ftp URL ǥμRFC 2396 URI Ϲ Ǹ ʷ, http URL ǥ Ŵ http URI References μ 䱸 , Ŵ ftp URI References XXϴ.

RFC 1738 XX ִ ftp URL Ǹ κ Ͽ ϴ.

# Excerpt of the ftp-specific fragment definitions.
# It relies on $pchar, $escaped, $hostport, $abs_path, $query and
# $fragment having been defined beforehand (they are not set here).
$segment = qq{$pchar*};
$ftptype = q{[AIDaid]};
$path_segments = qq{$segment(?:/$segment)*(?:;type=$ftptype)?};
$user = q{(?:[-_.!~*'()a-zA-Z0-9;&=+$,]|} . qq{$escaped)*};
$password = $user;
$userinfo = qq{$user(?::$password)?};
$server = qq{(?:$userinfo\@)?$hostport};
$authority = qq{$server};
$scheme = q{ftp};
$net_path = qq{//$authority(?:$abs_path)?};
$hier_part = qq{$net_path(?:\\?$query)?};
$absoluteURI = qq{$scheme:$hier_part};
$URI_reference = qq{$absoluteURI(?:#$fragment)?};
$ftp_URL_regex = q{\b} . $URI_reference;

ftp URL RFC 1738 ftpurl = "ftp://" login [ "/" fpath [ ";type=" ftptype ]] () ǵǰ ֽϴ. login κ path_segments شǴ Դϴٸ, ; () fpath κ ܶ ˴ϴ. ű⼭, segment κ ; () param , path_segments ftp URL ǿ ϵ() XXϴ. () login κ login = [ user [ ":" password ] "@" ] hostport ǵǰ ־ userinfo user [ ":" password ] ǰ ֽϴ. ,: user password ܶ XX (), userinfo κ : () Ӱ user, password μ userinfo ߽ϴ. scheme 翬 ftp ظ, Ŵ ftp URI References ()μ κ Ȃ URI_reference absoluteURI ߽ϴ.

ũƮκ 䱸 ftp URL ǥ Ǿϴ.

\bftp://(?:(?:[-_.!~*'()a-zA-Z0-9;&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*
(?::(?:[-_.!~*'()a-zA-Z0-9;&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*)?@)?(?
:(?:[a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?\.)*[a-zA-Z](?:[-a-zA-
Z0-9]*[a-zA-Z0-9])?\.?|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)(?::[0-9]*)?
(?:/(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*(?:/(?
:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*)*(?:;type=[
AIDaid])?)?(?:\?(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]|%[0-9A-Fa-f][0-9
A-Fa-f])*)?(?:#(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]|%[0-9A-Fa-f][0-9A
-Fa-f])*)?

ϰ ̰ ԓ ϴ ũƮ ϴ.

# Regular expression (as a string) matching an ftp URL, written out in
# full. The q{} fragments are concatenated verbatim, so they must not
# contain any whitespace.
$ftp_URL_regex =
q{\bftp://(?:(?:[-_.!~*'()a-zA-Z0-9;&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*} .
q{(?::(?:[-_.!~*'()a-zA-Z0-9;&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*)?@)?(?} .
q{:(?:[a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?\.)*[a-zA-Z](?:[-a-zA-} .
q{Z0-9]*[a-zA-Z0-9])?\.?|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)(?::[0-9]*)?} .
q{(?:/(?:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*(?:/(?} .
q{:[-_.!~*'()a-zA-Z0-9:@&=+$,]|%[0-9A-Fa-f][0-9A-Fa-f])*)*(?:;type=[} .
q{AIDaid])?)?(?:\?(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]|%[0-9A-Fa-f][0-9} .
q{A-Fa-f])*)?(?:#(?:[-_.!~*'()a-zA-Z0-9;/?:@&=+$,]|%[0-9A-Fa-f][0-9A} .
q{-Fa-f])*)?};
ž

ּ ǥ

RFC 821 RFC 822 RFC 2821 1~3 4,5 6~ )() RFC 2822 ( Ϻ ) obsolete Ǿϴ.

ּҿ ؼ RFC 821 ( Ϻ ) RFC 822 ( Ϻ ) ֽϴ. perl5. 6.0 perl ׷ ּ ǥ Ȯϰ ϴ ϴ. Jeffrey E. F. Friedl ǂ ؼ ǥ ( Mastering Regular Expressions ) ּҴ ׽Ʈ ߴ ڸƮ ƂǷ ǥ Ÿ; Ұɑٶ ֽϴ. ű⼭,Jeffrey E. F. Friedl ׽Ʈ Pe ڸƮ , 6,598Ʈ ޏ ǥ XX. http://public.yahoo.com/~jfriedl/regex/email-opt.pl ڵ尡 ֽϴ.

[\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x
80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\
\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :(? :[^(\040) <>@, ;:". \\\[\]\000
-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]) |"[^\
\\x80-\xff\n\015"]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015"]*) *")[\04
0\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\
xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80
-\xff\n\015()]*)*\)[\040\t]*)*(? :\. [\040\t]*(? :\([^\\\x80-\xff\n\0
15()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\x
ff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]
*)*(? :[^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]+(?![^(\040) <>@, ;:"
. \\\[\]\000-\037\x80-\xff]) |"[^\\\x80-\xff\n\015"]*(? :\\[^\x80-\xf
f][^\\\x80-\xff\n\015"]*) *")[\040\t]*(? :\([^\\\x80-\xff\n\015()]*(
? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\
\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*@[
\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x8
0-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\
x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040) <>@, ;:". \\\[\]\000-\03
7\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff])|\[(? :[^\
\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(? :\([^\\\x80-\x
ff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\
x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\
040\t]*)*(? :\. [\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-
\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\01
5()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040) <>@, ;:"
. \\\[\]\000-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-
\xff])|\[(? :[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(
? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\
015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\
n\015()]*)*\)[\040\t]*)*)*|(? :[^(\040) <>@, ;:". \\\[\]\000-\037\x80-
\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]) |"[^\\\x80-\xff
\n\015"]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015"]*) *")[^() <>@, ;:". \\
\[\]\x80-\xff\000-\010\012-\037]*(? :(? :\([^\\\x80-\xff\n\015()]*(?
:(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\
x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\) |"[^\\\x80-\xff\
n\015"]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015"]*) *")[^() <>@, ;:". \\\
[\]\x80-\xff\000-\010\012-\037]*) *<[\040\t]*(? :\([^\\\x80-\xff\n\0
15()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\x
ff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]
*)*(? :@[\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\
([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*
\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040) <>@, ;:". \\\[\]
\000-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff])|
\[(? :[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(? :\([^\
\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*
(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()
]*)*\)[\040\t]*)*(? :\. [\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\
\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\
xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040
) <>@, ;:". \\\[\]\000-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\
037\x80-\xff])|\[(? :[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\
040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80
-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x
80-\xff\n\015()]*)*\)[\040\t]*)*)*(? :,[\040\t]*(? :\([^\\\x80-\xff\
n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80
-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040
\t]*)*@[\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\
([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*
\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040) <>@, ;:". \\\[\]
\000-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff])|
\[(? :[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(? :\([^\
\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*
(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()
]*)*\)[\040\t]*)*(? :\. [\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\
\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\
xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040
) <>@, ;:". \\\[\]\000-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\
037\x80-\xff])|\[(? :[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\
040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80
-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x
80-\xff\n\015()]*)*\)[\040\t]*)*)*)*:[\040\t]*(? :\([^\\\x80-\xff\n
\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-
\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\
t]*)*)? (? :[^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]+(?![^(\040) <>@
, ;:". \\\[\]\000-\037\x80-\xff]) |"[^\\\x80-\xff\n\015"]*(? :\\[^\x80
-\xff][^\\\x80-\xff\n\015"]*) *")[\040\t]*(? :\([^\\\x80-\xff\n\015(
)]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff]
[^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*
(? :\. [\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([
^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\)
)[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :[^(\040) <>@, ;:". \\\[\]\0
00-\037\x80-\xff]+(?![^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]) |"[
^\\\x80-\xff\n\015"]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015"]*) *")[\
040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80
-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x
80-\xff\n\015()]*)*\)[\040\t]*)*)*@[\040\t]*(? :\([^\\\x80-\xff\n\0
15()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\x
ff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]
*)*(? :[^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]+(?![^(\040) <>@, ;:"
. \\\[\]\000-\037\x80-\xff])|\[(? :[^\\\x80-\xff\n\015\[\]]|\\[^\x80
-\xff])*\])[\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xf
f]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()
]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(? :\. [\040\t]*(? :\([^
\\\x80-\xff\n\015()]*(? :(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]
*(? :\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015(
)]*)*\)[\040\t]*)*(? :[^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff]+(?!
[^(\040) <>@, ;:". \\\[\]\000-\037\x80-\xff])|\[(? :[^\\\x80-\xff\n\01
5\[\]]|\\[^\x80-\xff])*\])[\040\t]*(? :\([^\\\x80-\xff\n\015()]*(? :
(? :\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(? :\\[^\x80-\xff][^\\\x
80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*) *>)

email-opt.pl Ȳ κ ߷ȴ ũƮԴϴ. Ȳ κ ߷ Դϴ.

# Check whether $email is a well-formed mail address (an RFC 822
# "mailbox"). The pattern is assembled from named fragments.
# Nested comments are handled one level deep only, because $comment
# embeds the non-recursive $Cnested.
#
# The fragments are spliced together verbatim, so none may contain stray
# whitespace: "(? :" is an invalid regex sequence, and an extra blank
# inside a character class changes which characters it matches.

$esc         = '\\\\';               $Period      = '\.';
$space       = '\040';               $tab         = '\t';
$OpenBR      = '\[';                 $CloseBR     = '\]';
$OpenParen   = '\(';                 $CloseParen  = '\)';
$NonASCII    = '\x80-\xff';          $ctrl        = '\000-\037';
$CRlist      = '\n\015';
$qtext       = qq/[^$esc$NonASCII$CRlist\"]/;
$dtext       = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/;
$quoted_pair = qq<${esc}[^$NonASCII]>;
$ctext       = qq<[^$esc$NonASCII$CRlist()]>;
$Cnested     = qq<$OpenParen$ctext*(?:$quoted_pair$ctext*)*$CloseParen>;
$comment     =
    qq<$OpenParen$ctext*(?:(?:$quoted_pair|$Cnested)$ctext*)*$CloseParen>;
# $X: optional whitespace and comments allowed between tokens.
$X           = qq<[$space$tab]*(?:${comment}[$space$tab]*)*>;
$atom_char   = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/;
# The negative lookahead forces an atom to take all consecutive atom chars.
$atom        = qq<$atom_char+(?!$atom_char)>;
$quoted_str  = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">;
$word        = qq<(?:$atom|$quoted_str)>;
$domain_ref  = $atom;
$domain_lit  = qq<$OpenBR(?:$dtext|$quoted_pair)*$CloseBR>;
$sub_domain  = qq<(?:$domain_ref|$domain_lit)$X>;
$domain      = qq<$sub_domain(?:$Period$X$sub_domain)*>;
$route       = qq<\@$X$domain(?:,$X\@$X$domain)*:$X>;
$local_part  = qq<$word$X(?:$Period$X$word$X)*>;
$addr_spec   = qq<$local_part\@$X$domain>;
$route_addr  = qq[<$X(?:$route)?$addr_spec>];
# Unlike $ctrl, this range leaves out \011 (tab), so a phrase may contain tabs.
$phrase_ctrl = '\000-\010\012-\037';
$phrase_char =
   qq/[^()<>\@,;:\".$esc$OpenBR$CloseBR$NonASCII$phrase_ctrl]/;
$phrase      =
    qq<$word$phrase_char*(?:(?:$comment|$quoted_str)$phrase_char*)*>;
$mailbox     = qq<$X(?:$addr_spec|$phrase$route_addr)>;

print "ok\n" if $email =~ /^$mailbox$/o;

perl5. 6.0 perl ǥ ׽Ʈ ڸƮ κ, ũƮ $Cnested $comment Թ ǵǰ ־ ׽Ʈ ǥ ǰ ֽϴ. 2 Թ Ͽ ϴ ּ ǥ Ȯϰ ִ ˴ϴ.

# Recursive definition of an RFC 822 comment using the postponed
# sub-expression (??{ code }), so that comments may nest.
# Relies on $OpenParen, $ctext, $quoted_pair and $CloseParen having been
# defined beforehand. 'use re "eval"' is required because the pattern
# contains interpolated variables together with a code block.
use re 'eval';
$comment     =
  qr<$OpenParen$ctext*(?:(?:$quoted_pair|(??{$comment}))$ctext*)*$CloseParen>;

ٸ, ⼭ ϰ ִ ǥ (??{ code }) ̹Ƿ ǰų 𸣹Ƿ ǰ ʿմϴ. , use re 'eval'; ϰ Ƿ, ʿ䰡 ֽϴ.  ʿ䰡 ִ ޴ о ּ. ּ ġ no re 'eval'; δ õմϴ.

ּҰ ùٸ  Ϸ Email::Valid Ǵ Mail::CheckUser ϴ ٰ մϴ. ϸ, ּҰ RFC 822 ִ ùٸ  ƴϰ, ּҰ ȿѰ  ֽϴ. ٸ, ͳݿ ӵǰ ʿ䰡 ֽϴ. ڼ ޴ о ּ.

׷, ּҶ ϴ From ִ Ϸ ϰ,RFC 822 mailbox μ ǵǰ ֽϴ. mailbox ִ ij κ ּҸ ϴ ϴ ̶ Դϴ. ׷ ʿϰ Ǵ mailbox ƴϰ, addr-spec (). mailbox addr-spec  ϱϰ ϸ, , OHZAKI Hiroki <ohzaki@din.or.jp> ϴ mailbox Դϴٸ addr-spec ƴմϴ. ohzaki@din.or.jp ϴ addr-spec ִκ ϼǴ mailbox ˴ϴ.

ű⼭ ũƮ , ij κ ּҸ Ѵ ϱ addr-spec ǥ Ͽ ϴ.

# Build $mail_regex, a regular expression (as a string) matching a mail
# address in the RFC 822 "addr-spec" form (local-part@domain), with no
# provision for comments or whitespace between tokens.
# The fragments must not contain stray whitespace (see $atom_char: a
# literal blank there would only duplicate \040, but "(? :" is invalid).

$esc         = '\\\\';               $Period      = '\.';
$space       = '\040';
$OpenBR      = '\[';                 $CloseBR     = '\]';
$NonASCII    = '\x80-\xff';          $ctrl        = '\000-\037';
$CRlist      = '\n\015';
$qtext       = qq/[^$esc$NonASCII$CRlist\"]/;
$dtext       = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/;
$quoted_pair = qq<${esc}[^$NonASCII]>;
$atom_char   = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/;
$atom        = qq<$atom_char+(?!$atom_char)>;
$quoted_str  = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">;
$word        = qq<(?:$atom|$quoted_str)>;
$domain_ref  = $atom;
$domain_lit  = qq<$OpenBR(?:$dtext|$quoted_pair)*$CloseBR>;
$sub_domain  = qq<(?:$domain_ref|$domain_lit)>;
$domain      = qq<$sub_domain(?:$Period$sub_domain)*>;
$local_part  = qq<$word(?:$Period$word)*>;
$addr_spec   = qq<$local_part\@$domain>;
$mail_regex  = $addr_spec;

ũƮ, ũƮκ, ߿ ڸƮ ̽ , Ȳ κ Դϴ. ũƮκ 䱸 addr-spec Ͽ Ǿϴ.

(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\
\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][
^\\\x80-\xff\n\015"]*)*")(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x
80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\
xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*"))*@(?:[^(\0
40)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])
(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;
:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x
80-\xff])*\]))*

ϰ ̰ ϴ ũƮ ˴ϴ.

# Regular expression (as a string) matching a mail address in addr-spec
# form, written out in full.
# Inside q{} a doubled backslash yields a single one, which is why a
# literal backslash in the regex is written with four backslashes here.
# The fragments are joined verbatim and must not contain whitespace.
$mail_regex =
q{(?:[^(\040)<>@,;:".\\\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\\} .
q{\[\]\000-\037\x80-\xff])|"[^\\\\\x80-\xff\n\015"]*(?:\\\\[^\x80-\xff][} .
q{^\\\\\x80-\xff\n\015"]*)*")(?:\.(?:[^(\040)<>@,;:".\\\\\[\]\000-\037\x} .
q{80-\xff]+(?![^(\040)<>@,;:".\\\\\[\]\000-\037\x80-\xff])|"[^\\\\\x80-} .
q{\xff\n\015"]*(?:\\\\[^\x80-\xff][^\\\\\x80-\xff\n\015"]*)*"))*@(?:[^(} .
q{\040)<>@,;:".\\\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\\\[\]\0} .
q{00-\037\x80-\xff])|\[(?:[^\\\\\x80-\xff\n\015\[\]]|\\\\[^\x80-\xff])*} .
q{\])(?:\.(?:[^(\040)<>@,;:".\\\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,} .
q{;:".\\\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\\\x80-\xff\n\015\[\]]|\\\\[} .
q{^\x80-\xff])*\]))*};

ּ ǥ $mail_regex () ,$email ùٸ ּҳ Ϸ ϴ.

# Check whether $email is a well-formed mail address (addr-spec).
# $mail_regex must already hold the addr-spec pattern; /o compiles it
# only once, so later changes to $mail_regex are not picked up here.

if ($email !~ /^$mail_regex$/o) {
  print "  ּԴϴ \n";
}

Դϴٸ, DoCoMo(i-mode) J-Phone(J-Sky) ּҷμ irregular. @docomo.ne.jp () @ . (Ǹ) ͵ ֽϴ. ׷, ̰ RFC 822 ʴ ּԴϴ. @ local-part κп . (Ǹ) () ݵ ٸ ڿ ˴ϴ. , . (Ǹ) () ο ִ @ ִ ּҶ ˴ϴ. DoCoMo(i-mode) J-Phone(J-Sky) ϱȯ̸ ϴٸ, ׷ ؾ ϴ ƴմϴ.

ž

Ϻ Ѵ

perl ũƮ EUC-JP

perl Ϻ Ϸ Ǵ ֽϴ. ֳϸ, Ϻ ڵ忡 perl Ư ǹ̷μ ؼ ڰ ԵǾ ֱ Դϴ. , perl ũƮ JIS ׸ ٰ մϴ.

$str = " TEST  ";
$str =~ s/ TEST / ׽Ʈ /;  # JIS  SJIS  Ÿ
print $str, "\n";
̰ ϴ. unmatched () in regexp Ǿ Դϴ. ֳϸ, escape sequence ESC ( B ԵǾ ֱ ؼ(),( ׷ȭ ȣμ ؼ Դϴ. , ݰ ȣ ) () ȣ ϰ ʴٰ ϴ Դϴ. ׷ ũƮ SJIS ϱ. ̹ unmatched [] in regexp () Ǿ Դϴ. ֳϸ SJIS 桹 ڵ 0x8F 0x5B ̸,0x5B ϴ ASCII [ ڵ̱ Դϴ.

ű⼭ SJIS 쿡 ǥ ʰ, κ \Q \E ̿ ξ ̽ Ѵٰ ϴ ȸ ֽϴ.

$str = " TEST  ";
$str =~ s/\Q  TEST \E/ ׽Ʈ /;  # ̰ SJIS  ?
print $str, "\n";
׷ , ̰ ϴ. ֳϸ, SJIS 10 ڵ 0x8F 0x5C (̾),0x5C ϴ ASCII \ ڵ̱ (), 1Ʈ° Ư ǹ̷μ ؼϷ ϱ ()Դϴ. \ () 1Ʈ° escape sequence , 10 2Ʈ° \ () õǰ ˴ϴ.

̿ SJIS 2Ʈ° \ ڰ ֱ ؼ() ڰ ϴ. 2Ʈ° @ ڿ 迭̶ ؼǾ ֽϴ. 2Ʈ° \ ڿ ؼ, ڷ \ () ȸ ֽϴٸ,2Ʈ° @ ڿ ؼ ٸ ȸ ˴ϴ. ٿ, SJIS 2Ʈ° \ ڴ ҬظԱ10øǥ鰴̳ȸغкä ¼üɸ Դϴ. ,2Ʈ° @ ڴ ̽ ⱼļ÷˿ȣּġơ Դϴ. ̷ ̿ܿ SJIS Ǵ ڰ ֽϴ.

ٿ SJIS 쿡 \Q \E ̿ ξ ̽ Ѵٰ ϴ ȸ ߽ϴٸ, ʽϴ. , ũƮ ּ.

if ($str =~ /\Q$keyword\E/) {
  print "  \n";
}

ũƮ , Ű $keyword () \Q \E ׸ ̿ θ ʰ ġ ִٰ ϴ ̾߱Ⱑ ֽϴ. и δ ʽϴٸ, SJIS $str = '׽Ʈ'; $keyword = 'X'; ġ ϸ() ϴ. ̰ SJIS ڵ尡 0x83 0x58 ̸, 0x58 ϴ ASCII X ڵ ֱ Դϴ. , $str = 'cab'; $keyword = '='; ϴ. ̰ cab ϴ ij ڵ 0x82 0x83 0x82 0x81 0x81 0x83 0x82 0x82 ؼ, ΰ ߳ ġ =𡹶 ϴ ij ڵ 0x83 0x82 0x81 0x81 0x83 0x82 Դϴ.

perl Ϻ ϱ ϳ jperl Ѵٰ ϴ Դϴ. jperl perl ġ , Ϻ ֵ() Դϴ. Windows jperl (Ű Ű)κ Լ ֽϴ.

http://www.shonanblue.ne.jp/~kipp/perl/jperl/index.html

׷ ũƮ EUC-JP ׸ . EUC-JP ׸ 쿡 Դϴ. ֳϸ,EUC-JP JIS SJIS perl Ư ǹ̷μ ؼ ڰ Ե ʱ Դϴ. perl Ϻ Ϸ perl ũƮ EUC-JP ׸ Դϴ. Ͽ,EUC-JP ׸ ũƮ ϰ ֽϴ.

EUC-JP ġ ص SJIS ߸ 찡 ֽϴ. ̰Ϳ ؼ ùٸ ġŲ () ּ.

ž

ڵ带 EUC-JP ȯ óѴ

perl ũƮ EUC-JP ٰ ص, Է Ϻ ڵ尡 SJIS JIS ϴ. ű⼭  ó EUC-JP ȯϰ մϴ. perl ũƮ EUC-JP , ڵ尡 EUC-JP Ϻ óѴ ϴ , perl Ϻ Ͼ Դϴ.

Է Ϻ ڵ尡 EUC-JP ƴ , Ǵ, ڵ带 𸣴 쿡, ڵ带 jcode.pl ( ī羾)() EUC-JP ȯ ݴϴ. $str EUC-JP () ȯϷ ϴ.

# Convert $str to EUC-JP.
# $str is passed by reference; jcode.pl is asked for the 'euc' output
# encoding and no input encoding is given.

require 'jcode.pl';

jcode::convert(\$str, 'euc');

'euc' κ 'sjis' 'jis' ϸ, SJIS JIS ȯ ֽϴ. , Է Ϻ ڵ尡 $code ̴ٰ ˰ ִ 쿡, ϴ ο ڵ Ǻ ʰ ֽϴ.

# Convert $str, whose character encoding is already known to be $code,
# to EUC-JP. The third argument passes that input encoding to jcode.pl.

require 'jcode.pl';

jcode::convert(\$str, 'euc', $code);

ڵ带 Ѵ ڵ Ǻ е ÷ 䱸 $code ϰ մϴ.

Դϴٸ, my ؼ, ׷κ ȯϷ ϴ ǼԴϴ.

# my   ȯ   ٸ 

require 'jcode.pl';

my $str = 'my    ׷κ ';

jcode::convert(*str, 'euc');
my ׷κ , ̰δ ȯ ϴ. my ϵ ۷ 䱸 Ƿ, ũƮ ׻ \$str Ͼ ƴ Դϴ.

jcode.pl () ҿ ֽ ֽϴ.

http://www.srekcah.org/jcode/
ֽ jcode.pl-2. 13 Դϴ. ̰ jcode.pl ̸ մϴ. jcode.pl jcode.pl ȿ ֽϴ. 𸣸, ī þ Ѵ jcode.pl ؼ 𸨴ϴ.

Jcode.pm - jcode.pl İ (ڰź) () ͵ ǰ ֽϴ. Jcode.pm UNICODE ϰ ֽϴٸ, Ϸ jcode.pl īϴ δ ȵǾ, иϰ ν ʿ䰡 ֽϴ. ٿWindows perl ̴ ActivePerl 5.6, Ű (Ű Ű ) ǰ ֽϴ.

http://www.shonanblue.ne.jp/~kipp/perl/packages/5.6/index.html

2.10 jcode.pl () thread ȿϰ Ǿ ִ perlϴ ϴ. thread ȿϰ Ǿ ִ perl Ư $_ () @_ ()ŰĮ ˴ϴ. ŰĮ my Դϴ. ŰĮ ϴ local ϴ ϴٸ, 2.10 jcode.pl Լ μ local ׷κ *_ Ϸ ϰ ֱ ؼ() ʽϴ. ֽ jcode.pl Jcode.pm () thread ȿϰ Ǿ ִ perl մϴ.

perl thread ȿϰ Ǿ ִ  Ϸ perl -V Է մϴ. usethreads=undef ǰ ȿ Ǿ Ƿ jcode.pl Ƚϰ ֽϴ. perl5. 005 perl thread ϴ. , thread ȿϰ Ǿ ־ 쿡 2.10 jcode.pl ,Ư $_ @_ ŰĮ Ǿ ְԵ ũƮ ʿ䰡 ֽϴ.

ž

ڵ带 Ѵ

# Determine the character encoding of $str.
# jcode::getcode() is called in list context and yields ($match, $code).
# When no encoding name came back but something did match, the text is
# treated as EUC-JP.

require 'jcode.pl';

($match, $code) = jcode::getcode(\$str);
# "eq undef" only worked by accident (undef stringifies to '') and
# warns under "use warnings"; test definedness explicitly instead.
$code = 'euc' if !defined $code and $match > 0;
jcode.pl getcode Լ մϴ. $code 'euc' 'sjis','jis' ׷ٰ ϴ ij  ֽϴ. ڼϰԴ jcode.pl о ּ.

⼭ ǰ ʿ , ڵ带 Ȯϰ ϴ Ϳ Ѱ谡 ִ ϴ Դϴ. SJIS (2 ) Ϻγ SJIS ݰ īŸī 2 EUC-JP 1 ʽϴ. , ڵ尡 EUC-JP ΰ SJIS ɼ ־, Ǵ jcode::getcode() undef () ݴϴ. ٸ, ϰԴ Ǵ ٰ ־ ݰ īŸī Ե () EUC-JP ̹Ƿ, ũƮ undef ƴϰ EUC-JP ϰ ֽϴ.

jcode::getcode() SJIS ݰ īŸī ʰ ϰ ֽϴ. , SJIS Ǵ ִ ݰ īŸī ԵǾ ִ EUC-JP ߸ ֽϴ. ű⼭, е ø ֽϴ.

# Determine the character encoding of $str, with a second look at
# results reported as EUC-JP: if the text is not made up entirely of
# valid EUC-JP characters but is made up entirely of valid SJIS
# characters (half-width katakana included), it is reclassified as SJIS.
# The $jcode::re_* patterns are provided by jcode.pl.

require 'jcode.pl';

($match, $code) = jcode::getcode(\$str);
# "eq undef" only worked by accident (undef stringifies to '') and
# warns under "use warnings"; test definedness explicitly instead.
$code = 'euc' if !defined $code and $match > 0;

$ascii = '[\x00-\x7F]';
if (defined $code and $code eq 'euc') {
  if ($str !~ /^(?:$jcode::re_euc_c|$jcode::re_euc_kana|
                   $jcode::re_euc_0212|$ascii)*$/ox) {
    if ($str =~ /^(?:$jcode::re_sjis_c|$jcode::re_sjis_kana|$ascii)*$/o) {
      $code = 'sjis';
    }
  }
}
̰ SJIS EUC-JP ߸ ɼ ֽϴٸ, ŭ ó ð ɷ ؾ ȵ˴ϴ. ̿ ڵ е ÷ 䱸 $code () ڵ带 ȯ 쿡 ̿ ֽϴ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .
ž

ڰ ԵǾ ִ Ѵ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# Check whether $str (EUC-JP) contains a full-width character.
# Half-width katakana are not counted: they start with \x8E, which is
# outside the \xA1-\xFE range required for the first byte here.

if ($str =~ /[\xA1-\xFE][\xA1-\xFE]/) {
  print "ԵǾ ִ \n";
}
ڴ JIS X 0208 JIS X 0212 ̹Ƿ, ݰ īŸī JIS X 0201 Ÿī ʽϴ. ڰ ԵǾ ִ  Ϸ ,JIS X 0208 JIS X 0212 κ̸, ASCII JIS X 0201 Ÿī Ÿ ʴ /[\xA1-\xFE][\xA1-\xFE]/ մϴ.
# Check whether $str (EUC-JP) contains half-width katakana, by looking
# for the \x8E byte.

if ($str =~ /\x8E/) {
  print "ԵǾ ִ \n";
}

ݰ īŸī ԵǾ ִ  Ϸ ,EUC-JP /\x8E/ ϴ ͸ ֽϴ.

# Check whether $str (EUC-JP) contains anything other than ASCII, by
# looking for a \x8E byte or any byte in the \xA1-\xFE range.

if ($str =~ /[\x8E\xA1-\xFE]/) {
  print "ԵǾ ִ \n";
}

ASCII ̿ ڰ ԵǾ ִ Ϸ , /[\x8E\xA1-\xFE]/ ϴ ֽϴ. \x8E () JIS X 0201 Ÿī 1Ʈ° , [\xA1-\xFE] JIS X 0208 1Ʈ°ΰ, JIS X 0212 2Ʈ° ϱ , ASCII ̿ ڰ ԵǾ ִ ֽϴ.

$str EUC-JP ȭ jcode.pl () ֽϴ. jcode.pl ڵ带 Ѵ ũƮ $str ڵ带 undef ASCII ̿ ڴ Ե ʴٰ ֽϴ. ݴ ϸ,undef ƴ ASCII ̿ ڰ ԵǾ ִٰ ֽϴ. , Ȳ $match ʰ , ڱ undef ȭ ϴ ߸Ǿ ֽϴ.

# $str  ASCII ̿ܰ ԵǾ ִ    ٸ 

require 'jcode.pl';
$code = jcode::getcode(\$str);

if ($code eq undef) {
  print "ASCII ܴ̿ Ե  \n";
  print " Ǵ ߸ \n";
}
jcode::getcode() EUC-JP SJIS ɼ ־, Ǵ undef () ݴϴ. ڵ带 Ѵ $match undef 츦 ó ʿ䰡 ֽϴ.
ž

ڰ ߴܵǰ ִ Ѵ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# Check whether the last character of $str (EUC-JP) has been cut off in
# the middle of its byte sequence.
# Either the string ends in a lone \x8F, or the number of \x8E and
# \xA1-\xFE bytes is odd; tr with an empty replacement list only counts
# and leaves $str unchanged.

if ($str =~ /\x8F$/ or $str =~ tr/\x8E\xA1-\xFE// % 2) {
  print " ڰ ߴܵǰ ִ \n";
}

EUC-JP ڰ ߴܵ ɼ ִ ,JIS X 0201 Ÿī(ݰ īŸī)()JIS X 0208( ) JIS X 0212( )Դϴ. JIS X 0212 ǥǾ ʰ \x8F ۵˴ϴ. $str \x8F ׸ ־ , ,JIS X 0212 1Ʈ° ߴܵǰ ־ 츦 Ÿ ֽϴ. JIS X 0201 Ÿī JIS X 0208 1Ʈ° ߴܵǰ ־ JIS X 0212 2Ʈ° ߴܵǰ ־ Դϴ. tr/\x8E\xA1-\xFE// $str ,JIS X 0201 Ÿī JIS X 0208 1Ʈ° 2Ʈ°, JIS X 0212 2Ʈ° 3Ʈ° ֽϴ. Ȧ ڰ ߴܵǰ ִ ֽϴ.

ž

ڸ ݰڷ ȯѴ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# $str  ڸ ݰڷ ȯѴ

require 'jcode.pl';

jcode::tr(\$str, '0-9A-Za-z', '0-9A-Za-z');

jcode.pl tr Լ մϴ. Լ ڿ ߴ tr Դϴ. ڼϰԴ jcode.pl о ּ. ⺻ tr ׷, ̿ܿ ̽ ݰ ̽ ϴ ȯ ϰ ֽϴ.

# $str   ̽ ݰ ̽  ȯѴ

require 'jcode.pl';

jcode::tr(\$str, '()ߣ', ' ()_@-');

ݴ, 1 μ 2 μ ݴ ϸ, ݰ ڸ ڷ ֽϴ. ݰ īŸī īŸī ȣ ȯ ؼ ݰ īŸī īŸī ȯѴ .

ž

ݰ īŸī īŸī ȯѴ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# $str  ݰ īŸī  īŸī ȯѴ

require 'jcode.pl';

jcode::h2z_euc(\$str);

jcode.pl h2z_euc Լ մϴ.

ž

ùٸ ġŲ

$str $pattern EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

perl Ϻ ϴ 쿡 ũƮ EUC-JP , ڵ尡 EUC-JP Ϻ óѴٴ Ͼ ̴ٰ ϴ perl ũƮ EUC-JP ڵ带 EUC-JP () ȯ óѴ ߽ϴ. ׷, װ͸δ Ͼ ֽϴ. , ũƮ ϸ() ߸ ϴ.

# ߸    

$str = '̰ ׽ƮԴϴ';
$pattern = 'ȣ';

if ($str =~ /$pattern/) {
  print "  \n";
}

̷ Ͼ ° ϸ(),EUC-JP ڵ 0xA5 0xB9 , Ʈ 0xA5 0xC8, ȣ 0xB9 0xA5 (̾), Ȯ ľ Ѱ κ ȣ ǹǷ ϴ. ̿ ߳ ҿ 쿡 ϴ.

# Match $pattern against $str (both EUC-JP) without ever matching across
# a character boundary.
# The prefix consumes whole characters only (1-byte ASCII, 2-byte, or
# 3-byte starting with \x8F), lazily, from the start of the string, so
# $pattern can only begin where a character begins.

$ascii = '[\x00-\x7F]';
$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

if ($str =~ /^(?:$ascii|$twoBytes|$threeBytes)*?(?:$pattern)/) {
  print "  \n";
}

̷ Ǵ մϴ. ʰ ߸ ũƮ /$pattern/ ϰ Ű ߱ ߳ ҿ Ƚϴ. ű⼭, ߳ ҿ ʰ Ϸ ,$pattern Ϻ ڰ ΰ ־, Ŀ $pattern () ϴ ʿ䰡 ֽϴ. EUC-JP 1 ϴ ASCII, JIS X 0201 Ÿī(ݰ īŸī) JIS X 0208( ), JIS X 0212( )Դϴ. ̰ ǥ Ÿ´ (? :$ascii|$twoBytes|$threeBytes) κԴϴ. ڰ ij ηκ ΰ ӵ Ŀ $pattern ´ٰ ϴ ǥ ũƮԴϴ.

ǥ ڸ Ÿ . (Ǹ)() մϴٸ, Ϻ ij ο Ī, . (Ǹ) () ; Ҹ (? :$ascii|$twoBytes|$threeBytes) ׷ٸ Ϳ ˴ϴ. ũƮ /$pattern/ /^. *? (? :$pattern)/ () ϸ ũƮ Ǵ ͵ ֽ ִ ƴұ.

Ϻ ij ο ؼ ùٸ Ű μ ݱ ó EUC-JP 1 иϰ ǽ ǥ ٰ ϴ ̿ܿ, ̸ Ų ڷ ܶ ֵ()ܶ ڸ ϴ ֽϴ. üδ , ǰ ִ Ϻ ij $str Ű ϰ ִ $pattern ο ܶ ڸ ó ϰ Ī մϴ. ũƮ ܶ ڷμ \000 () ϰ ֽϴ.

# Match correctly by inserting a delimiter character (\000) after every
# multi-byte character of both the pattern and the string.
# With the delimiter in place, a byte pair that straddles two characters
# no longer looks like a character of the pattern.
# NOTE: $pattern and $str are modified in place.

$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

$pattern =~ s/($twoBytes|$threeBytes)/$1\000/og;
$str =~ s/($twoBytes|$threeBytes)/$1\000/og;

if ($str =~ /$pattern/) {
  print "  \n";
}

Ű ܶ ڸ ó ϴ , ǥ ü ֽϴ. ˱ ϴٸ, Ƹ κ ܶ ڸ ʴ ũƮ ӵ .

2() Ư¡ ڽϴ. ܶ ڸ ʴ , ó ٷ ġ ִ. ׷, ġ װ ǥ ϱ ʴ. ܶ ڸ , ̸ ij ü ܶ  ִ ó ʿ䰡 ִ. ٸ, ġ ü ǥ ϰ DZ () .

׷ ϸ  ɱ ҽϴ. ġ ʾҴ , ij ü ˻ ϴ Ϳ ˴ϴٸ, ġũ Ҵµ, ܶ ڸ ʴ´ е ( 15)Դϴ. ϴ 쿡, ij ߿ ˻ Ƿ, ij ü ؼ ݵ ó Ǵ ܶ ڸ ߴ ͵ ϴ. ᱹ, ܶ ڸ ʴ ǥ ϰ ŭ ġ ü ʾϴٸ, ܶ ڸ ϴ , ȵ ܶ  ִ ó ʹ ʾ ġ ü ӵ ʾҴ ϴ.

κ ϸ(), ; ()κ ϴ ͸ ó иϰ ܶ ڸ ʴ´ ũƮ ٰ ֽϴ. ܶ ڸ μ, ó ġ ӵ ִ ŭ ̳ ij ο ؼ ġ ϴ Դϴ. , ̰͵ ӵ ؼ ȯ濡 ϴ ̹̾߱Ƿ, ڽ ȯ濡 Դϴ.

, Ϻ ij ùٸ ġȯϴ մϴ. ũƮ ߸ ġȯ ٰ ϴ ߴ Դϴ.

# ߸ ġȯ  

$str = '̰ ׽ƮԴϴ';
$pattern = 'ȣ';
$replace = '';

$str =~ s/$pattern/$replace/g;

ùٸ ġȯ ֽϴ.

# Replace $pattern with $replace in $str (EUC-JP) without ever matching
# across a character boundary.
# \G anchors each /g iteration where the previous one ended; $1 holds
# the whole characters skipped before the match and is put back
# unchanged.

$ascii = '[\x00-\x7F]';
$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

$str =~ s/\G((?:$ascii|$twoBytes|$threeBytes)*?)(?:$pattern)/$1$replace/g;

ũƮ ⺻ ,EUC-JP 1 иϰ ǽ ǥ ϴ. ٸ, ų (ŭ) ٸ $1 () \G ϰ ִ ()Դϴ. $1 () ϴ , ġȯϴ κ ų $pattern ִ ڵ Բ Ų DZ (), κ ġȯ ʰ ״ ʿ䰡 ֱ Դϴ. ű⼭ $pattern κп شǴ ǥ (? :$ascii|$twoBytes|$threeBytes)*? () ȣ ѷ $1 ׸ ֵ() ϰ ֽϴ.

\G մϴ. \G () ϴ g ٿ ֱ ()Դϴ. g ,  (ŭ)̶ ʿ, ,ۿ ġȯ ʴ 쵵 ʿ ϴ. g ̴ ׸ξ, \G ij ο Ѵ ^ ٲ ֽϴ. ݴ ϸ, g ٿ $str $pattern () ġȯϰ , ij ο Ѵ ^ () ٰ ϴ Դϴ. \G () g ٿ ִ Ǯ, ġ ġ մϴ. ,\G ó ^ , κʹ $pattern ڷ մϴ. ˱ ϰ ϸ(),\G  ݺ Ϸ ϰ ִ κ ο Ѵٰ ֽϴ. \G () ϴ , ߳ ġ $pattern ϴ ˴ϴ.

ġȯ 쿡 ܶ ڸ ùٸ ġȯϴ ֽϴ.

# Replace correctly by inserting a delimiter character (\000) after
# every multi-byte character of both the pattern and the string, doing a
# plain substitution, and removing the delimiters again.
# NOTE: $pattern is left with the delimiters in it.

$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

$pattern =~ s/($twoBytes|$threeBytes)/$1\000/og;
$str =~ s/($twoBytes|$threeBytes)/$1\000/og;

$str =~ s/$pattern/$replace/g;
# tr removes every \000, including any that $str contained originally.
$str =~ tr/\000//d;
# Alternative that removes only the delimiters following a character:
# $str =~ s/($twoBytes|$threeBytes)\000/$1/og;

⺻ ܶ ڸ ùٸ Ű ϴ. ٸ, ų (ŭ) ޸, ġȯ Ŀ ܶ ڸ ʿ䰡 ֽϴ. ũƮ ܶ ڿ \000 () ϰ ־, ġȯ Ŀ ܶ ڸ tr ϰ ֽϴ. ׷ , $str ȿ ó ԵǾ ־ \000 Բ ϴ. tr ִ $str ȿ ܶ ڿ \000 Ե ʴٰ ϴ ʿմϴ. , $str ȿ \000 ԵǾ 𸣴 쿡 tr () ܶ ڸ ϴ ߾, $str =~ tr/\000//d; $str =~ s/($twoBytes|$threeBytes) \000/$1/og; () մϴ.

ӵ 2() ҽϴ. ij ο ؼ ġȯϴ () 쿡, ܶ ڸ ʴ´ е ( 35)Դϴ. ڸ ġȯ ʿ䰡 ִ ij ־ 쿡, ܶ ڸ ʴ 4 Դϴ. , ܶ ڸ ϴ ó tr ʾҴ 쿡 ǵ ̰ . ᱹ, ġȯ 쿡 ܶ ڸ ϴ , ó ó ð ɷ ٰ ϴ ֽϴ. ӵ ؼ ȯ濡 ϴ ̹̾߱Ƿ, ڽ ȯ濡 ǰ ̶ ϴ ͵ ϴ.

׷, ̾߱⿡ $pattern Perl ùٸ ǥ̶ ϴ ϴ. ׷ϱ, ȣ ( Ű 쿡 \( ̽ ʿ䰡 ֽϴ. CGI  ־, Է ij ϴ ˻ϰ , Էµ ij ǥμ ؼϴ ƴ϶, ij ü ˻ϰ 찡 κ. ׷ , $pattern μ ġ ϸ(), ȣ ( ԷµǾ ǥμ ùٸ Ǿ ϴ. ű⼭ ǥ Ư ǹ̷μ ؼǴ ȣŸ ij() ̽ ġų ʿ䰡 ֽϴ.

(), Է $keyword ؼ, ݱ ũƮ $pattern κ \Q$keyword\E , ġ ,

# Literal-keyword variant of the character-aligned match: \Q...\E quotes
# every regex metacharacter in $keyword.
# Relies on $ascii, $twoBytes and $threeBytes being defined beforehand.
if ($str =~ /^(?:$ascii|$twoBytes|$threeBytes)*?\Q$keyword\E/) {
  print "  \n";
}
ġȯ ,
# Literal-keyword variant of the character-aligned substitution.
$str =~ s/\G((?:$ascii|$twoBytes|$threeBytes)*?)\Q$keyword\E/$1$replace/g;
() մϴ. \Q κ \E Ÿ ijͰ õǰ ˴ϴ.

ӵ ø 1 Ӵϴ. ݱ ó, Ϻ ij ο ؼ ùٸ Űų ġȯѴ () ǥ ʿ䰡 ֽϴ. , ϰ ŭ ӵ ʾ ϴ. ̰, 뷮 ߿ ˻ϰų ġȯϰų ϴ 쿡 ſ ð ɸ Ǿ ǹմϴ.

⼭ . 뷮 ߿ ˻ , κ ʴ´Դϴ. , ʱ ùٸ ų ʿ ϴ. ű⼭ $pattern ˻ϰ , ϴ κ ӵ ø ϴ.

# Two-step match: the cheap unanchored test rejects strings that cannot
# match at all, and only the survivors pay for the character-aligned
# match.
if ($str =~ /$pattern/) {
  if ($str =~ /^(?:$ascii|$twoBytes|$threeBytes)*?(?:$pattern)/) {
    print "  \n";
  }
}

$keyword , /\Q$keyword\E/ ϴ ǥ ʰ index Լ մϴ.

# Two-step match for a literal keyword: index() is used as the cheap
# pre-filter, followed by the character-aligned match.
if (index($str, $keyword) > -1) {
  if ($str =~ /^(?:$ascii|$twoBytes|$threeBytes)*?\Q$keyword\E/) {
    print "  \n";
  }
}

index Լ ǥ ӵ е , ǥ ƴϰ,index Լ ° ׻ ϰ Դϴ.

ݱ EUC-JP ƴϰ, SJIS 쿡 ֽϴ. SJIS 쿡 SJIS 1 Ѵ ǽ ǥ ˴ϴ. SJIS 1 ؼ, ǥ .

,EUC-JP perl5. 005 Ķ ϴ (), κ 쿡 ݱ ӵ , ϱ ֽϴ. Ͽ մϴ.

# Character-aligned matching for EUC-JP on perl 5.005 or later, using
# zero-width assertions placed before and after the pattern.

# The match must not start right after \x8F.
$eucpre = qr{(?<!\x8F)};
# What follows the match must be whole 2-byte characters up to an ASCII
# byte, \x8E, \x8F, or the end of the string.
$eucpost = qr{
    (?=
     (?:[\xA1-\xFE][\xA1-\xFE])* # zero or more byte pairs in \xA1-\xFE
     (?:[\x00-\x7F\x8E\x8F]|\z)  # ASCII, SS2, SS3, or end of string
    )
 }x;

if ($str =~ /$eucpre(?:$pattern)$eucpost/) {          # pattern match
  print "  \n";
}

if ($str =~ /$eucpre\Q$keyword\E$eucpost/) {      # keyword match
  print "  \n";
}

$str =~ s/$eucpre(?:$pattern)$eucpost/$replace/g;     # pattern substitution

$str =~ s/$eucpre\Q$keyword\E$eucpost/$replace/g; # keyword substitution

쿡 ص,$eucpre $eucpost ̿ δ ŭ ˴ϴ. ǥб(lookbehind) ()(lookahead) ϰ ֽϴ. б (? <regex), (? =regex) ׷ٰ ϴ ǥ ˴ϴ. ũƮ б б (? <! regex) () ϰ ֽϴ.

Ű ǥ ߳ ġ ƴ () б ϰ ֽϴ. üδ, б κп JIS X 0212 2Ʈ° ߳ ϰ ְ ° üũϰ ֽϴ. JIS X 0212 2Ʈ° ϰ ־ , κ JIS X 0212 1Ʈ°, , \x8F ְ ˴ϴ. ׷, б⿡ \x8F ƴ ǰ Ƿ, JIS X 0212 2Ʈ° ߳ ϴ ϴ.

,JIS X 0208 2Ʈ° ߳ () JIS X 0212 3Ʈ° ߳ ġ 쿡 üũ κп ϰ ֽϴ. , ġκ ߳ , κп ʰ ˴ϴ. κ κ ڷ ùٸ EUC-JP ij ְ ()  üũϰ ֽϴ. üδ, κ ڷκ, JIS X 0208̿ , ùٸ JIS X 0208 ڰ ӵǰ ִ  üũϰ ֽϴ.

б⸸ ùٸ ų ֽϴ. () б ̳ ü ij ʴ´ 0 ǥԴϴ. , ġ ȯ ϴ 쿡 ġȯ ij ξȿ $eucpre $eucpost κ $1 () ʿ ʰ ˴ϴ.

ž

( ̽) Ѵ

# Remove leading whitespace (including full-width spaces) from $str.
# Use the line that matches the encoding of $str; $Zspace, $Zspace_sjis,
# $character, $character_sjis and $eucpre are defined elsewhere.
$str =~ s/^(?:\s|$Zspace)+//o; # when $str is EUC-JP
$str =~ s/^(?:\s|$Zspace_sjis)+//o; # when $str is SJIS

# Remove trailing whitespace (including full-width spaces) from $str.
# The leading part is consumed character by character so the whitespace
# pattern cannot match in the middle of a multi-byte character.
$str =~ s/^($character*?)(?:\s|$Zspace)+$/$1/o; # when $str is EUC-JP
$str =~ s/$eucpre(?:\s|$Zspace)+$//o; # when $str is EUC-JP (perl 5.005 or later)

$str =~ s/^($character_sjis*?)(?:\s|$Zspace_sjis)+$/$1/o; # when $str is SJIS

ũƮ ϰ ִ ؼ ǥ ùٸ ġŲ () ּ.

̽ ڸ , () ߸ ɼ ֽϴ.

# Remove trailing whitespace from $str -- WRONG example, kept on
# purpose: without consuming the preceding characters, the full-width
# space pattern can match bytes that belong to two different characters.
$str =~ s/(?:\s|$Zspace)+$//o; # when $str is EUC-JP
$str =~ s/(?:\s|$Zspace_sjis)+$//o; # when $str is SJIS

ڸ ϴ 쿡 ؼ Ư ϴٸ, ڸ 쿡 ̽ Ƽ Ʈ Ϻ  ߸ ɼ ֽϴ. , SJIS $str = '=@'; , ߸ ̸ ϴ. ڼϰԴ, perl ũƮ EUC-JP ùٸ ġŲ () ּ.

ž

Ѵ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# Split $str (EUC-JP) into individual characters and store them in
# @chars. With /g in list context and no capture groups, the match
# returns the list of all matched substrings.

$ascii = '[\x00-\x7F]';
$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

@chars = $str =~ /$ascii|$twoBytes|$threeBytes/og;

Թ ˱ , @chars = ($str =~ /($ascii|$twoBytes|$threeBytes) /og; () ᵵ մϴ. ó մϴ. EUC-JP 1 $ascii|$twoBytes|$threeBytes () ǥ Ÿ ִ ùٸ ġŲ ߽ϴ. ̰ ȣ ѷ ׷ ϰ ֽϴ. , Թ @chars ̹Ƿ, 캯Ʈ ˴ϴ. ġ Ʈ ϸ(), ׷ ǥ ϴ ij Ʈ ־ϴ. , ($1, $2, $3,) () ϴ Ʈ ־ϴ. g ٿ ֱ , ($1, $2, $3,, $1, $2, $3,) () ϴ Ʈ ־ ˴ϴ. ׷ ǰ ִ ǥ 1̹Ƿ, Ȯ EUC-JP 1 ҵ Ʈ ־ ˴ϴ.

ũƮ @chars Թ 캯 ü ȣ ѷΰ ʽϴٸ, ̰ = =~ (), @char = $str () ٰ ϴ ϴ.

, ǥ ü ȣ ѷΰ ʽϴٸ, g ٿ ִ ġ Ʈ , ǥȿ ȣ 1 ڵ ǥ ü ȣ ѷ ִͰ մϴ. , ǥ ü ȣ ѷմ 캸 ӵ Դϴ. ߿ $1 ()μ ϴ ͵ ƴϰ ǥ ü ȣ ѷѴ 쿡 ȣ ʴ Դϴ.

ž

Ư ̷ ǮѴ

# Fold $str every $bytes bytes and print each piece on its own line.
# fold() is expected to come from fold.pl; it returns the folded piece
# and the remainder, which is assigned back to $str until it is empty.

require 'fold.pl';

while (length($str)) {
  (my $folded, $str) = fold($str, $bytes);
  print $folded, "\n";
}

fold.pl ( ī羾)() ϴ մϴ. fold.pl ʰ, ڰ ߴܵǰ ִ Ѵ ó ڰ ߴܵ ϸ鼭 substr Լ ǮѴٰ ϴ ֽϴٸ, Ϻη ʿ Դϴ. fold Լ 3 μ 1 ϸ, Ǯ $bytesƮ ģ 쿡 ̽ $bytesƮ ǵ() ϴ ֽϴ. , 4 μ 1 ϸ ܾ 迡 Ǯϰ ˴ϴ. ڼϰԴ fold.pl о ּ. ٿ fold.pl ڿ SJIS ݰ īŸī ϴ. ,EUC-JP ݰ īŸī μ ϱ , ݰ īŸī ̰ () ǥ ̰ ߻մϴ. ǥ ߰ 쿡, ݰ īŸī ̸ īŸī ȯ δ, Ǯϴ Ʈ ϰ ó ʿ䰡 ֽϴ.

Jcode.pm jfold Լ () ص ϴٸ, ܾ 迡 Ǯϰų ϴ.

μ ݰ īŸī Ģ ó ϸ鼭 Ǯϴ ũƮ Ǿ Ӵϴ. ũƮ EUC-JP $str EUC-JP ׷ٰ ϴ ̹Ƿ, ʿϸ ̸ EUC-JP () ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# $str  Ģ ó ϸ鼭 ǮѴ

require 'fold.pl';
require 'jcode.pl';

$no_begin = "! %),. :;? ]}ˬࣧ,. 页" .
    "Ȣ" .
    "-! %),. ? ݣ";              # ٸӸ Ģ 
$no_begin_jisx0201 = "., Ȣ";
jcode::z2h_euc(\$no_begin_jisx0201);
$no_begin .= $no_begin_jisx0201;                  # characters forbidden at the start of a line (half-width katakana forms)
$no_end = "\$([{\ (ۣ";  #   Ģ 
$no_end_jisx0201 = " ";
jcode::z2h_euc(\$no_end_jisx0201);
$no_end .= $no_end_jisx0201;                      # characters forbidden at the end of a line (half-width katakana forms)
$allow_end = $no_begin;                          # Ŵް ٸӸ Ģ 
$del_space = '(?:\s|\xA1\xA1)';                   # whitespace stripped at the start and end of a line (\xA1\xA1 is the EUC-JP full-width space)
$basebytes = 74;                                 # base line width
$maxbytes = 76;                                  # maximum line width
$ascii = '[\x00-\x7F]';
$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

# Turn the kinsoku character lists into lookup hashes keyed by character.
# %allow_end stores 1 for an ASCII or \x8E-prefixed character and 2 for
# any other character containing a byte in \xA1-\xFE.
for ($no_begin =~ /$ascii|$twoBytes|$threeBytes/og) {
  $no_begin{$_} = 1;
}
for ($no_end =~ /$ascii|$twoBytes|$threeBytes/og) {
  $no_end{$_} = 1;
}
for ($allow_end =~ /$ascii|$twoBytes|$threeBytes/og) {
  $allow_end{$_} = 1 + /[\xA1-\xFE]/ - /\x8E/;
}

# Cut one line off the front of $str while applying the kinsoku
# (forbidden line-start / line-end character) tables.
#
# Argument: the text still to be folded.
# Returns:  ($folded, $strtmp) -- the line that was cut off, with
#           trailing whitespace removed, and the remaining text.
#
# Reads the file-level settings $basebytes, $maxbytes, $del_space,
# $ascii, $twoBytes, $threeBytes and the hashes %no_begin, %no_end,
# %allow_end. fold() is presumably the function from fold.pl that the
# surrounding script requires -- TODO confirm its 3rd/4th arguments.
sub fold_properly {
  my $str = shift;
  my($folded, $strtmp, $bytestmp, $begin_char, $end_char, $flag);
  $flag = 1; # line-start kinsoku mode (1: let the character hang, 0: push characters to the next line)
  $bytestmp = $basebytes;
  $str =~ tr/\t\n\r\f/ /; # turn tab/newline/CR/form feed into a space
  $str =~ s/^$del_space+//o; # strip leading whitespace
  ($begin_char) = %no_begin; # take one forbidden line-start character so the loop condition is true on entry
  while ($no_begin{$begin_char} or $no_end{$end_char}) {
    ($folded, $strtmp) = fold($str, $bytestmp, 0, 1);
    # Every \x8E byte is subtracted from the length, so a half-width
    # katakana counts as one column; widen the cut while it still fits.
    while (length($folded) - ($folded =~ tr/\x8E//) <= $basebytes and
   $strtmp ne '' and $flag) {
      ($folded, $strtmp) = fold($str, $bytestmp, 0, 1);
      my ($folded_tmp, $strtmp_tmp) = fold($str, $bytestmp + 1, 0, 1);
      if (length($folded_tmp) - ($folded_tmp =~ tr/\x8E//) <= $basebytes) {
        ($folded, $strtmp) = ($folded_tmp, $strtmp_tmp);
        $bytestmp++;
      } else {
        last;
      }
    }
    # First character of the remainder (after whitespace) and last
    # character of the cut line; these drive the loop condition.
    ($begin_char) = $strtmp =~ /^$del_space*($ascii|$twoBytes|$threeBytes)/o;
    ($end_char) = $folded =~ /($threeBytes|$twoBytes|$ascii)$/o;
    if ($flag) { # hanging mode
      if ($no_begin{$begin_char} and $allow_end{$begin_char}) { # this character may hang
        if (length($folded) - ($folded =~ tr/\x8E//)
            + $allow_end{$begin_char} <= $maxbytes) {
          $bytestmp++;
        } else {
          # Hanging would exceed $maxbytes: switch to push-out mode.
          $flag = 0;
          $bytestmp = $basebytes - 1 + ($folded =~ tr/\x8E//);
        }
      } else {
        $flag = 0;
        $bytestmp--;
      }
    } else {
      $bytestmp--;
    }
    if ($bytestmp == 0) { # kinsoku cannot be satisfied: fall back to a plain fold
      ($folded, $strtmp) = fold($str, $basebytes, 0, 1);
      last;
    }
  }
  $folded =~ s/^((?:$ascii|$twoBytes|$threeBytes)*?(?=$del_space))
      $del_space+$/$1/ox; # strip trailing whitespace, character-aligned
  ($folded, $strtmp);
}

# Print $str line by line: each call to fold_properly() returns the next
# line and the remainder, which replaces $str until nothing is left.
while (length($str)) {
  (my $folded, $str) = fold_properly($str);
  print $folded, "\n";
}
ž

Base64encodeڵѴ

$str EUC-JP ϴ ̹Ƿ, ʿϸ ̸ EUC-JP ȯ ּ. ڵ ȯ ؼ ڵ带 EUC-JP () ȯ óѴ .

# Base64-encode $data and store the result in $encoded_data.
# Called without a second argument, encode_base64 breaks its output into
# lines of at most 76 characters.

use MIME::Base64;

$encoded_data = encode_base64($data);

Base64encode Ϸ , MIME::Base64 encode_base64 Լ մϴ. Base64encodeڵ忡 ؼ RFC 2045 ( Ϻ ) ֽϴ. ⿡ Base64encode ó 76 ̳ ƴϸ ȵǸ ֽϴ. encode_base64 Լ 2 μ ʰ ҷ 쿡 ڵ 76 ڵ带 ־ Ǯ ݴϴ.

# Decode the Base64 data in $encoded_data back into $data

use MIME::Base64;

$data = decode_base64($encoded_data);

Base64ڵϷ , MIME::Base64 decode_base64 Լ մϴ. $encoded_data 76 Ǯϱ ؼ() Եǰ ִ ڵ尡  äε ϴ.

encoded-word մϴ. encoded-word ؼ RFC 2047 ( Ϻ ) ֽϴ. encoded-word ϴ =? charset? encoding? encoded-text? = ׷ٰ ϴ ¸ Դϴ. =? ISO-2022-JP? B? GyRCTmMbKEI=? = "" ϴ ij encoded-word () Դϴ. ⿡ encoding B ߴ encoded-word մϴ.

encoding B ϴ encoded-text κ Bencode Ÿ ֽϴ. Bencode ϴ Base64encode encode Դϴٸ, encoded-word Base64encode θ ϰ Bencode θϴ.

# B-encode $str into a single encoded-word (incomplete version: no length
# limit and no line folding)

require 'jcode.pl';
use MIME::Base64;

# Convert to JIS; the 'z' option is passed to jcode::convert as its fourth argument.
jcode::convert(\$str, 'jis', 'euc', 'z');
# The empty second argument stops encode_base64 from appending a newline.
# The wrapper must not contain spaces: "=?charset?encoding?text?=".
$str = '=?ISO-2022-JP?B?' . encode_base64($str, '') . '?=';

BencodeϷ encode_base64 Լ ϸ ڽϴٸ, 2 μ ʴ encode ڵ尡 پ Ƿ, ڿ ڵ尡 ʰ ϰ ֽϴ. ,charset ISO-2022-JP ϴ , ̸ $str JIS ȯ ʿ䰡 ֽϴ. ȮϰԴ ISO-2022-JP ȯ ʿ䰡 ֽϴ. ISO-2022-JP () ȯϷ ⺻ JIS ȯ ָ ڽϴٸ, ISO-2022-JP ݰ īŸī ʽϴ. ű⼭ ݰ īŸī ԵǾ ־ 쿡 īŸī ȯ ʿ䰡 ֽϴ. ̰ Ϸ jcode::convert Լ 4 μ 'z' ݴϴ.

encoded-word ȯϴ ʴ ̸ŭԴϴٸ, ̰ ̸ RFC 2047 () ä ҿ Դϴ. RFC 2047 encoded-word ȯϴµ ־ Ű Ǵ . ü ֽϴ.

  1. encoded-word 75Ʈ̳ ƴϸ ȵȴ.
  2. encoded-word 76Ʈ̳ ƴϸ ȵȴ.
  3. encoded-word () ڵ ȴ.
  4. encoded-text ڵ ij ڵ, ASCII ° ƴϸ ȵȴ.
  5. encoded-word Ÿ ġ .
    • Subject Comment ʵ, 'text' ȿ .
    • "(" ") " ܶ 'comment' ȿ .
    • From To, CC ,'phrase' ȿ .
    • 'addr-spec' ȿ ؼ ȴ.
    • 'quoted-string' ȿ ؼ ȴ. .
  6. ̿ Ǵ encoded-word 'linear-white-space' Ѵ.

1 4 encoded-word ȯ ɴϴ. ũƮ 3 4 ؼ Ŭ ϰ ֽϴٸ, 1 2 ؼ Ű澲 ʽϴ. 1 2 ؼ ϱ ؼ ߴ Ͼϴ.

켱, 1 Դϴٸ,encoded-word ̰ 75Ʈ Ѵ 쿡,BencodeѴ ª ,2()̻ encoded-word ȯ ȵ˴ϴ. 2()̻ encoded-word () ؼ(),Bencode encoded-text () 3 ä() ֽϴٸ, ׷ 4 () ä Ǿ ϴ. 4 ä鼭 ª Ϸ , ij ȵǾ, иϰϺ ª ʽϴ. , ڵ ߿ ȵȴٰ ϴ Դϴ. Ϻ ª (), Ĵ jcode.pl JIS ȯϸ, ڵ ڵ尡 ASCII ° ǵ() ݴϴ.

, 2 ϴ մϴ. encoded-word () 76Ʈ̳ ƴϸ ȵȴٰ ϴ , encoded-word ȯ , ȯ 76Ʈ̳ Ǿ ֵ() encoded-word ̸ ؾ Ѵٰ ϴ ˴ϴ. , encoded-word ȯϸ() 76Ʈ Ѿ 쿡, Ǯʿ䰡 ֽϴ.

̻ encoded-word ȯ ü () ˴ϴٸ, ϴ ־, װ 5 Դϴ. , κ encoded-word () ȯϸ , ϴ Դϴ. κ ڵϸ() ϴ ͵ ˴ϴ. ij ־ ó϶ () ֺм̳ м ʿϰ Ǿ ϴ. ⿡ ſ ű ,encoded-word ȯϰ κ, ȯ ϰ κ ־ ũƮ ϴ.

# $str  encoded-word  ȯ $line  ߰Ѵ

require 'jcode.pl';
use MIME::Base64;

$ascii = '[\x00-\x7F]';
$twoBytes = '[\x8E\xA1-\xFE][\xA1-\xFE]';
$threeBytes = '\x8F[\xA1-\xFE][\xA1-\xFE]';

# Append $str to $line as one or more B-encoded ISO-2022-JP encoded-words,
# starting a continuation line (newline + one space) whenever the current
# line would grow beyond 76 bytes.
#
# Depends on $ascii, $twoBytes, $threeBytes (character patterns defined next
# to this sub), jcode::jis() from jcode.pl and encode_base64() from
# MIME::Base64.
#
# Arguments: $str  - text to encode
#            $line - header text built so far (may already span several lines)
# Returns:   $line with the encoded-words appended.
sub add_encoded_word {
  my($str, $line) = @_;
  my $result = '';

  while (length($str)) {
    my $target = $str;
    $str = '';
    # Start a new line if not even the shortest encoded-word would fit.
    # NOTE(review): 22 looks like 18 wrapper bytes + 4 Base64 characters, and
    # the extra 8 like the cost of the escape sequences around a leading
    # multi-byte character - confirm.
    if (length($line) + 22 +
        ($target =~ /^(?:$twoBytes|$threeBytes)/o) * 8 > 76) {
      $line =~ s/[ \t\n\r]*$/\n/;
      $result .= $line;
      $line = ' ';
    }
    # Shrink $target one character at a time until its encoded-word fits on
    # the current line; the removed characters are pushed back onto $str.
    while (1) {
      my $encoded = '=?ISO-2022-JP?B?' .
          encode_base64(jcode::jis($target, 'euc', 'z'), '') . '?=';
      if (length($encoded) + length($line) > 76) {
        $target =~ s/($threeBytes|$twoBytes|$ascii)$//o;
        $str = $1 . $str;
      } else {
        $line .= $encoded;
        last;
      }
    }
  }
  $result . $line;
}

$line = add_encoded_word($str, $line);
࿹
$line = 'Subject: ';
$str = '̰ ׽ƮԴϴ. This is test. ';
$line = add_encoded_word($str, $line);
print $line, "\n";

 
Subject: =?ISO-2022-JP?B?GyRCJDMkbCRPJUYlOSVIJEckOSElGyhCVGhpcyBpcyB0ZXN0?=
 =?ISO-2022-JP?B?Lg==?=

ũƮ $line $str encoded-word () ȯϰ ߰մϴ. $str ,encoded-word 75Ʈ̳ ǵ() ̰ () ϴٸ ũƮ ϰ ʽϴ. , κ encoded-word ұԴϴٸ,RFC 2047 encoded-word ȯ ʿ䰡 , , ASCII ִκ ϼǴ ܾ ȯϴ õ ֽϴ. ׷ϱ, is test. Բ encoded-word ȯ ϴ ʹ ϴ. ̰Ϳ ؼ, Subject unstructured header 쿡 ũƮ ϴ.

# MIME-encode the unstructured header $header.
# Uses add_encoded_word(), which must be defined alongside this sub.
#
# The header is split on whitespace. Words made only of printable ASCII
# (\x21-\x7E) are copied as they are; every other word is passed to
# add_encoded_word(). Adjacent words that both need encoding are first joined
# (with the separating space) so that they are encoded together.
#
# Argument: the header line, with or without a trailing newline.
# Returns:  the encoded header; it ends with "\n" only if the input did.

sub mime_unstructured_header {
  my $oldheader = shift;
  my($header, @words, @wordstmp, $i) = ('');
  my $crlf = $oldheader =~ /\n$/;
  $oldheader =~ s/\s+$//;
  @wordstmp = split /\s+/, $oldheader;
  # Merge each word into its successor when neither is plain ASCII.
  for ($i = 0; $i < $#wordstmp; $i++) {
    if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and
        $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) {
      $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]";
    } else {
      push(@words, $wordstmp[$i]);
    }
  }
  push(@words, $wordstmp[-1]);
  foreach my $word (@words) {
    if ($word =~ /^[\x21-\x7E]+$/) {
      $header =~ /(?:.*\n)*(.*)/; # $1 = the last line built so far
      if (length($1) + length($word) > 76) {
        $header .= "\n $word";
      } else {
        $header .= $word;
      }
    } else {
      $header = add_encoded_word($word, $header);
    }
    # Separate from the next word; fold if the line is exactly full.
    $header =~ /(?:.*\n)*(.*)/;
    if (length($1) == 76) {
      $header .= "\n ";
    } else {
      $header .= ' ';
    }
  }
  $header =~ s/\n? $//mg; # remove the separator left at the end of a line
  $crlf ? "$header\n" : $header;
}

$header = mime_unstructured_header($header);
࿹
$header = "Subject: ASCII Ϻ ASCII Ϻ ASCII ASCII\n";
$header = mime_unstructured_header($header);
print $header;

 
Subject: ASCII =?ISO-2022-JP?B?GyRCRnxLXDhsGyhCIEFTQ0lJGyRCJEhGfEtcGyhC?=
 =?ISO-2022-JP?B?GyRCOGwbKEI=?= ASCII ASCII

ũƮ ũƮ Լ add_encoded_word() ̿ϰ ֽϴ. ũƮ $line = add_encoded_word($str, $line); () , ũƮ մϴ.

ũƮ κп ܾ ϰ ֽϴ. ⼭ ҵǾ ܾ , ASCII ִκ ϼǴ ܾΰ  encoded-word ȯ  ϴ. 6 ʿ䰡 ֽϴ. ڵ encoded-word 'linear-white-space' õ˴ϴٸ, ̰ 1 ̰ 쿡,encoded-word () ϱ ؼ Ե ʿ 'linear-white-space' () ϱ Դϴ. ׷,κ Ѵ 'linear-white-space' encoded-word () ȯ (), ڵ ߸ Ǿ () ˴ϴ. ű⼭,'linear-white-space' encoded-word () ȯ ʿ䰡 ִ 쿡,'linear-white-space' ܾ 1 encoded-word μ ȯմϴ.

# Turn the B-encoded encoded-words in $str back into plain text

require 'jcode.pl';
use MIME::Base64;

# linear-white-space: one or more of (optional line break + space or tab)
$lws = '(?:(?:\x0D\x0A|\x0D|\x0A)?[ \t])+';
# one ISO-2022-JP "B" encoded-word; $1 captures the Base64 text
$ew_regex = '=\?ISO-2022-JP\?B\?([A-Za-z0-9+/]+=*)\?=';
# Remove linear-white-space that only separates two encoded-words.
$str =~ s/($ew_regex)$lws(?=$ew_regex)/$1/gio;
# Collapse any remaining linear-white-space into a single space.
$str =~ s/$lws/ /go;
$str =~ s/$ew_regex/decode_base64($1)/egio;
jcode::convert(\$str, 'euc', 'jis');

ũƮ ־ ij $str encoded-word ǵϴ. ̿ ȴ encoded-word 'linear-white-space' () մϴ. encoded-word "(" ̴ٵ簡,'linear-white-space'encoded-word ̸, ׷ ϰ encoded-word ,쿬 ׷ ij ̶ ؼ, ǵ ؾ ʽϴ. ׷, ũƮ encoded-word ǵ , ij $str () ִ () , ǵ ͸ ʿ䰡 ֽϴ. , $str = q{"=? ISO-2022-JP? B? GyRCTmMbKEI=? ="}; quoted-string ̹Ƿ, ȿ encoded-word Ÿ ϴ. ̰ ǵ ȵ˴ϴ.

Outlook Express encoded-word ȯ ٺ긣ũƮ ѷ quoted-string ϹǷ,RFC 2047 () ä ϴ. Outlook Express 5 ϴ. ׷,Outlook Express 5 Ѵ κ Ϸ encoded-word () 76Ʈ̳ ƴϸ ȵȴٰ ϴ ä ʽϴ.

encoded-word ȯ ϴ ũƮμ mime_pls (Ÿ 뺸羾) () ͵ ǰ ֽϴ. ׷, ̰͵ RFC 2047 ϰ ä ִ ƴմϴ. encoded-word ȯ ؼ, Subject ̳ From ̸ ʰ ڸƮ ó ϴ. , word ʱ , $str = "test ׽Ʈ "; () ij ȯ, ȯ ϸ() "test ׽Ʈ " () ̽  ϴ. Ư $`, $&, $' ϰ Ƿ, ġ ӵ ʾ ֽϴ. encoded-word κ ȯ ؼ, ó ϰ encoded-word ̴ ͱ ǵ ϴ. ̰ ùٸ ϱ ؼ() ƹ м ʿϰ ˴ϴ.

Jcode.pm MIME encode Լ mime_encode MIME ڵ Լ mime_decode 0.63Ŀ ũ Ʈ äǰ ֽϴ.

RFC 2047 ϰ ä ִ encoded-word ȯ ϴ ũƮμ IM(Internet Message) IM::Iso2022jp ֽϴ. ǥ ƴϱ , ϱ ؼ() IM ν ʿ䰡 ֽϴ. Iso2022jp.pm ּ.

ž

URI ̽ ̽ Ѵ

'̽' ϴ ij '%a5%a8%a5%b9%a5%b1%a1%bc%a5%d7' URI ̽Ϸ ϴ.

# URI-escape $str: every non-word character becomes %xx (lower-case hex)

$str =~ s/(\W)/'%' .  unpack('H2', $1) /eg;

ݴ '%a5%a8%a5%b9%a5%b1%a1%bc%a5%d7' ׷ٰ ϴ ij URI ̽ '̽' ׷ٰ ϴ ij ο ǵ ϴ.

# URI-unescape $str: every %HH sequence becomes the byte it stands for
# (the pattern must not contain a space after the hex digits)

$str =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack('H2', $1)/eg;

ġũ δ, URI ̽ ̽ Ѵ ӵ . ̿ μ unpack Լ ʰ sprintf Լ ord Լ() Ѵٵ簡, pack Լ 'H2' ʰ hex Լ chr Լ, Ȥ, hex Լ pack Լ 'C' Ѵٵ簡, i Ѵٵ簡, {2} Ѵٵ簡 ֽϴٸ, Ư ʿ Դϴ. ,'%A5%A8%A5%B9%A5%B1%A1%BC%A5%D7' () ĺ 빮ڷ ȯص ϴٸ, sprintf Լ ord Լ Ǿ, ó ʾϴ.

, ؽÿ ||= () , ̿ϴ ֽϴٸ, CGI Ѵ κ ũƮ .

# URI-escape $str (memoising each conversion in %escape)

$str =~ s/(\W)/$escape{$1} ||= '%' . unpack('H2', $1)/eg;
# URI-unescape $str (memoising each conversion in %unescape)

$str =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/$unescape{$1} ||= pack('H2', $1)/eg;

̿ , ̿Ϸ ϴ κ, , '%' . unpack('H2', $1) pack('H2', $1) ׸ŭ ó ƴϱ Դϴ. κ ó 쿡, ̿ϴ ȿ ɴϴٸ, ̹ ׸ŭ ó ƴ 쿡, ؽø ϰų ||= 带 ؼ() ݴ ʾ ϴ. ġũ ߴµ, URI ̽ ̿ǿ̿700%, , ̿ κ ȿ Ѵٰ ϴ ϴ.

ݴ ϸ, 뷮 óϷ 쿡 ȿ ִٰ ϴ Դϴٸ, ׷ ̸ ȯ ̺ غ д () ӵ Դϴ.

# URI-escape $str (using a conversion table built in advance)

foreach $i (0x00 .. 0xFF) {
  $escape{chr($i)} = sprintf('%%%02x', $i);
}

$str =~ s/(\W)/$escape{$1}/g;
# URI-unescape $str (using a conversion table built in advance)

foreach $i (0x00 .. 0xFF) {
  my($hi, $lo) = split //, sprintf('%02x', $i);
  # Register every upper/lower-case spelling, including mixed ones such as
  # "aF"; the pattern below accepts them, and a missing key would silently
  # turn the sequence into an empty string.
  foreach my $key ($hi . $lo, uc($hi) . $lo, $hi . uc($lo), uc($hi) . uc($lo)) {
    $unescape{$key} = chr($i);
  }
}

$str =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/$unescape{$1}/g;

ȯ ̺ǿ ʷ ȯ ̺ غѴٰ ϴ ó ʿϰ ˴ϴٸ, ȯ ü e , ij ǹǷ ӵ ϴ.

URI ̽ Ǵ Դϴٸ, ũƮ ܼ \W ϰ ;ϴ? ׷, ̰ ϰԴURI ̽ ʿ䰡 ڱ URI ̽ ϴ. ݵURI ̽ؾ XX ڴ RFC 2396 ( Ϻ ) unreserved μ XX XX ̿ܿ XXϴ. unreserved ̿ ڸURI ̽ ũƮ Ͽ ϴ.

# URI-escape $str (bare minimum): escape every character except
# a-z A-Z 0-9 _ . ! ~ * ' ( ) -

$str =~ s/([^a-zA-Z0-9_.!~*'()-])/'%' .  unpack('H2', $1) /eg;

⿡ CGI URI Ư ̾߱Ⱑ ˴ϴ.

URI ̽Ϸ , RFC 2396 URI μ ִ uric μ ǵǰ ִ ܸ̿ ̽ Ƹ , URI::Escape uri_escape Լ , ǥ [;\/? :@&=+\$, A-Za-z0-9\-_.!~*'()] ׸ ǥǴ ܸ̿ ̽ ϸ ȴ١ ϴ ̾߱Ⱑ ֽϴٸ, ̰ǼԴϴ. ȮϰԴ,  ǹ̷δ װ ϴٸ, Ƹ CGI ־ κ Ǽ. uri_escape Լ ϰ XX, URI Է URI ̿ ڸ ̽ ͂;, CGI ϵ ̽ Ϸ δ ǹ̰ ;. , $value = 'A&B=C'; , print "http://foo.bar/cgi-bin/hoge.cgi? value=$value"; ()  ȴٰ մϱ? uri_escape Լ $value ̽ ص & = () URI ̹Ƿ ̽ ʽϴ. , value=A B=C Ѵ 2() & ϰ ִٰ ؼǾ ϴ. uri_escape Լ 2 μ ȯ ϴ ڸ ֽϴ. ٸ, ϰ ִ ũƮ ̹Ƿ, Ϻη ǥ ƴ URI::Escape ν ͵ Դϴ.

, ̽ + ȣ ȯ ̾߱⸦ մϴ. CGI  ͸ dzִ μ, FORM GET Ǵ POST ϴ Ŀ μμ dzִ 2 ֽϴ. 2() ̽ + ȣ ȯ Ⱑ ٸ ֽϴ. FORM GET Ǵ POST ϴ ؼ HTML 4.0 ( Ϻ ) 17.13. 4 Form content types content types Ʈ application/x-www-form-urlencoded encode μ ֽϴ. Ŀ μμ dzִ پ CGI/1. 1 5. The CGI Script Command Line ֽϴ.

application/x-www-form-urlencoded encode control names values ̽ + ȯ, ̿ reserved character %HH Ŀ URI ̽ . ׸, controle names values = ܶ , () & ܶ þϴ. , ̽ + () ȯ, ̿ reserved character URI ̽ , name1=value1&name2=value2 ׷ٰ ϴ ϴ Դϴ. control names values ؼ ϴ ó κ ˴ϴ.

# Core of form encoding for $str: %xx-escape everything except word
# characters and spaces, then turn each space into '+'

$str =~ s/([^\w ])/'%' .  unpack('H2', $1) /eg;
$str =~ tr/ /+/;

̽ + ȯ DZ , URI ̽ s/%20/+/g; ȯ ٽ ϴ ֽϴٸ, ũƮ ̽ ؼ ʿ ó ʰ () ӵ Դϴ.

# Core of form decoding for $str: turn '+' back into a space first, then
# expand the %HH sequences (so that an escaped "%2B" still yields a literal '+')

$str =~ tr/+/ /;
$str =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack('H2', $1)/eg;

URI ̽ (),+ ȯǰ ִ ̽ ʰ ǹǷ, + () ̽ ǵ Ӵϴ. $str =~ s/\+/ /g; ص մϴٸ, ȯ̹Ƿ ӵ tr Լ մϴ.

Ϲ ǰ ִ URL encode ϴ Ű ִ Դ 𸨴ϴٸ, application/x-www-form-urlencoded encode URL encode Ѵٸ, reserved character %HH ȯϴ URI ̽ ó URL encode ϴ Ǽ . ,URI ̽ URL encode Ѵٸ, ̽ + () ȯ ȴٰ ϴ Ǽ ˴ϴ.

, Ŀ μμ dzִ Դϴٸ, search-string = search-word *( "+" search-word ) ǰ ֽϴ. ü ϸ, http://foo.bar/cgi-bin/hoge.cgi? arg1+arg2+arg3 ˴ϴ. ̽ + ȯѴٰ Ѵ ̾߱ 𿡵 ϴ. search-string ܶ ִ + ʺ + , ̽ ȯ () ƴմϴ. ,search-string ̽ ԵǾ ־ 쿡, reserved character URI ̽ǰ ǹǷ %20 ȯǰ ˴ϴ. ̽ Ʋ + ȯ (), $value = 'A B C'; , ̰ CGI μμ dzַ print "http://foo.bar/cgi-bin/hoge.cgi? $value"; () ϴ ϸ(), http://foo.bar/cgi-bin/hoge.cgi? A+B+C Ǿ, hoge.cgi 3 μ 'A','B', 'C' ް Ǿ ϴ. ̰ http://foo.bar/cgi-bin/hoge.cgi? A%20B%20C () ϴ ˴ϴ.

Ŀ μμ dzִ ̽ + ȯǰ ִ ƴϱ , + ̽ ǵ ؼ ȴٰ ϴ ˴ϴ. Ŀ μμ dzִ QUERY_STRING κ query κ, , ? κ ֽϴ. , FORM GET Ǵ POST ϴ Ŀ μμ dzִ 𿡼 Ͱ dz׹޴ 𸣴 쿡, QUERY_STRING κ ͸ ޾ óϱ ؼ, + () ̽ ȯؾ ΰ ȯؼ ȵǴ Ǵ ʿ䰡 ֽϴ. Ǵϴ ,QUERY_STRING = ԵǾ ִ  մϴ. ԵǾ , װ application/x-www-form-urlencoded encode ϰ ִ ˴ϴ. Ե , Ŀ μμ Ͱ dz׹ ˴ϴ.

ž

ڵ带 Ѵ

# Replace every CRLF, lone CR or lone LF in $_ with "\n".
# CRLF is listed first so that it is not matched as a CR followed by an LF.
s/\x0D\x0A|\x0D|\x0A/\n/g;

ũƮ Windows(DOS), Mac, UNIX  ÷ ڵ带 ÷ ڵ忡 մϴ. ڵ Windows(DOS) \x0D\x0A, Mac \x0D, UNIX \x0A ̹Ƿ, ̰͵ ڵ忡 Ϸ \x0D\x0A|\x0D|\x0A ʿ䰡 ֽϴ. ʴ \x0D\x0A () ݵ ȵ˴ϴ.

ڵ带 ϱ ؼ s/\r\n|\r/\n/g; () ǼԴϴ.