#!/perl/bin/perl # # mknmz.pl - indexer of Namazu # Version 1.3.0.6 [03/12/1999] # # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either versions 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA # # This file must be encoded in EUC-JP encoding # package main; require 5.003; use Cwd; use Time::Local; use strict; # be strict since v1.2.0 ## ## software information ## my $VERSION = "1.3.0.6"; my $COPYRIGHT = "Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved."; my $NMZ_URL = "http://www.ring.gr.jp/openlab/namazu/"; # Japanese usage my $USAGE_JA = < -a: すべてのファイルを対象とする -c: 日本語の単語のわかち書きに ChaSen を用いる -e: ロボットよけされているファイルを除外する -h: Mail/News のヘッダ部分をそれなりに処理する -k: 日本語の単語のわかち書きに KAKASI を用いる -m: ChaSen の形態素解析の品詞情報を利用する (名詞のみ登録) -q: インデックス処理の最中にメッセージを表示しない -r: man のファイルを処理する -u: uuencode と BinHex の部分を無視する -x: HTML のヘディングによる要約作成を行わない (文書の先頭から作成) -D: Date:, From: といったヘッダを要約につけない (ディフォルトではつける) -E: 単語の両端の記号は削除する (ディフォルトでは含める) -G: 送り仮名を削除する (ディフォルトでは含める) -H: 平仮名のみの単語は登録しない (ディフォルトでは登録を行う) -K: 記号はすべて削除する (ディフォルトでは登録を行う) -L: 行頭・行末の調整処理を行わない (ディフォルトでは調整を行う) -M: MHonArc で作成された HTML の処理を行わない (ディフォルトでは行う) -P: フレーズ検索用のインデックスを作成しない (ディフォルトでは作成する) -R: 正規表現検索用のインデックスを作成しない (ディフォルトでは作成する) -U: URLのencodeを行わない (ディフォルトでは行う) -W: 日付によるソート用のインデックス作らない (ディフォルトでは作成する) -X: フィールド検索用のインデックスを作らない (ディフォルトでは作成する) -Y: 削除された文書の検出を行わない (ディフォルトでは行う) -Z: 文書の更新/削除を反映しない (ディフォルトでは行う) -A: .htaccess で制限されたファイルを除外する -l (lang): 言語を設定する ('en' or 'ja') -F (file): インデックス対象のファイルのリストを読み込む -I (file): ユーザ定義のファイルをインクルードする -O (dir) : インデックスファイルの出力先を指定する -T (dir) : NMZ.{head,foot,body}.* のディレクトリを指定する -t (regex): 対象ファイルの正規表現を指定する EOFusage my $USAGE_EN = < -a: target all files -c: use ChaSenI as Japanese processor -e: exclude files which has robot exclusion -h: treat header part of Mail/News well -k: use KAKASI as Japanese processor -m: use ChaSenI as Japanese processor with morphological processing -q: suppress status messages during execution -r: treat man files -u: decode uuencoded part and discard BinHex part -x: do not make summary with structure of HTML's headings -D: do not insert headers such as 'Date:' to summary (default: off) -E: delete symbols on edge of word (default: off) -G: delete Okurigana in word (default: off) -H: ignore words consist of Hiragana only (default: off) -K: delete all symbols (default: off) -L: do not adjust beginning and end of line (default: off) -M: do not do special processing for MHonArc (default: off) -P: do not make the index for phrase search (default: off) -R: do not make the index for regexp search (default: off) -U: do not encode URL (default: off) -W: do not make the index for sort by date (default: off) -X: do not make the index for field search (default: off) -Y: do not detect deleted documents (default: off) -Z: do not detect update and deleted documents (default: off) -A: exclude files restricted by .htaccess -l (lang): specify the language ('en' or 'ja', default:en) -I (file): include user defined file in advance of index processing -F (file): load a file which contains list of target files -O (dir) : specify a directory to output the index -T (dir) : specify a directory where NMZ.{head,foot,body}.* are -t (regex): specify a regex for target files EOFusage ## ## required softwares ## ## Network Kanji Filter nkf v1.62 my $NKF = "nkf32"; ## KAKASI or ChaSen (to handle Japanese characters) ## KAKASI must have -w option added by Hajime BABA-san my $KAKASI = "kakasi -ieuc -oeuc -w"; ## ChaSen 1.51 (simple wakatigaki) my $CHASEN = "chasen -j -F '\%m '"; ## ChaSen 1.51 (with morphological processing) my $CHASEN_MORPH = "chasen -j -F '\%m %H\\n'"; ## Default Japanese processer my $WAKATI = $KAKASI; my $MorphOpt = 1 if "KAKASI" eq "CHASEN_MORPH"; ## Table of helper programs and extentions my %HELPER_PROGRAMS = ( 'gz' => 'zcat', 'Z' => 'zcat', 'man' => 'groff -man -Tnippon', ); ## Make regex of extensions my $HELPER_EXTENSIONS = join('|', sort {length($b) <=> length($a)} keys %HELPER_PROGRAMS); ## ## Names of Index files ## my $DBNAME = "NMZ"; my $FLIST = "$DBNAME.f"; my $FLISTINDEX = "$DBNAME.fi"; my $INDEX = "$DBNAME.i"; my $INDEXINDEX = "$DBNAME.ii"; my $HASH = "$DBNAME.h"; my $REGLIST = "$DBNAME.r"; my $HEADERFILE = "$DBNAME.head"; my $FOOTERFILE = "$DBNAME.foot"; my $LOGFILE = "$DBNAME.log"; my $SLOGFILE = "$DBNAME.slog"; my $LOCKFILE = "$DBNAME.lock"; my $LOCKFILE2 = "$DBNAME.lock2"; my $LOCKMSGFILE = "$DBNAME.msg"; my $BODYMSGFILE = "$DBNAME.body"; my $ERRORSFILE = "$DBNAME.err"; my $BIGENDIAN = "$DBNAME.be"; my $LITTLEENDIAN = "$DBNAME.le"; my $WAKATITMP = "$DBNAME.wkc.$$"; my $TMP_I = "$DBNAME.tmp_i.$$"; my $TMP_W = "$DBNAME.tmp_w.$$"; my $TMP_P = "$DBNAME.tmp_p.$$"; my $TMP_PI = "$DBNAME.tmp_pi.$$"; my $WORDLIST = "$DBNAME.w"; my $PHRASE = "$DBNAME.p"; my $PHRASEINDEX = "$DBNAME.pi"; my $FIELDINFO = "$DBNAME.field"; my $DATEINDEX = "$DBNAME.t"; my $TOTALFILESCOUNT = "$DBNAME.total"; ## ## Default values ## my $LIBDIR = "/usr/local/namazu/lib"; # directory contains library and etc. my $LANGUAGE = "ja"; # language of messages #$SYSTEM = "WIN32"; # UNIX/MSWin32/os2 my $ADMIN = 'webmaster@foobar.jp'; # admin's email address my $CGI_ACTION = '/cgi-bin/namazu.exe'; #
's ACTION の指定 ## Prefix of URL (\t will be treated as full path name) my $URL_PREFIX = "\t"; ## Files can be omission in URL. e.g. index.html my $DEFAULT_FILE = "_default"; ## Target files' regex my $TARGET_FILE = '.*\.html?|.*\.txt|.*_default'; ## Non-Target files' regex my $DENY_FILE = '.*\.gif|.*\.(jpg|jpeg)|.*\.tar\.gz|core|.*\.bak|.*~|\..*|\x23.*|NMZ\..*'; ## HTML extentions like .htm, .html, .shtml, .phtml, .html.en, .html.ja, .asp my $HTML_SUFFIX = 'html?|[ps]html|html\.[a-z]{2}|asp|cgi'; ## Place where CGI prgrams is in. e.g. /cgi-bin/, /htbin/ my $CGI_DIR = '/(cgi-bin|htbin)/'; ## MHonArc's message file my $MHONARC_MESSAGE_FILE = 'msg\d{5}\.html(?:\.gz)?'; ## MHonArc's header for identification (regex) my $MHONARC_HEADER = '<\!-- MHonArc v\d\.\d\.\d -->'; ## Mail/News's headers should be remained as searchable text ## (case insensitive) my $REMAIN_HEADER = "From|Date|Message-ID"; ## Mail/News's headers should be inserted in search results my $SUMMARY_HEADER = "From|Date|Author|Newsgroups"; ## Mail/News's headers for field specified search (NMZ.field.*) my $SEARCH_FIELD = "Message-Id|Subject|From|Date|Url|Newsgroups|To"; ## Aliases for NMZ.field.* my %FIELD_ALIASES = ('title' => 'subject', 'author' => 'from'); my $TEXT_TITLE = " (Text File) "; # text file my $NO_TITLE = "No title in original"; # document has no title ## ## Size of files indexed at once on memory. (bytes) ## If you have much memory, you can increase this value. (128MB or more) ## If you have not much memory, you can decrease this value. (32MB or less) ## my $ON_MEMORY_MAX = 5000000; # M K bytes ## File size limitation. The file larger than this value will not be indexed. my $FILE_SIZE_LIMIT = 600000; # M K bytes ## Word length limitation. The word longer than this value will be ignored. my $WORD_LENG_MAX = 128; ## ## Weights for HTML elements ## Element names should be described in CAPITAL letter ## my $TITLEW = 16; # only TITLE has own variable my %TAGW = (); $TAGW{'H1'} = 8; $TAGW{'H2'} = 7; $TAGW{'H3'} = 6; $TAGW{'H4'} = 5; $TAGW{'H5'} = 4; $TAGW{'H6'} = 3; $TAGW{'A'} = 4; $TAGW{'STRONG'} = 2; $TAGW{'EM'} = 2; $TAGW{'KBD'} = 2; $TAGW{'SAMP'} = 2; $TAGW{'VAR'} = 2; $TAGW{'CODE'} = 2; $TAGW{'CITE'} = 2; $TAGW{'ABBR'} = 2; $TAGW{'ACRONYM'} = 2; $TAGW{'DFN'} = 2; ## Weight for Mail/News's header my $REMAIN_HEADER_W = 8; ## ## タグを捨てる際に空白を挿入しないタグ ## 例えば、これは重要です。などという文脈ではタグは削除すべき ## だが、 This is foo.
That is bar. という文脈ではタグは空白に変換すべき ## my $NON_SEPARATION_TAGS = 'A|TT|CODE|SAMP|KBD|VAR|B|STRONG|I|EM|CITE|FONT|U|'. 'STRIKE|BIG|SMALL|DFN|ABBR|ACRONYM|Q|SUB|SUP|SPAN|BDO'; ## タグによる重みづけする際にこの数字以上の長さの文字列の場合は重 ## みづけをしない ( を文字サイズの指定のために本文全体を囲ったり ## している人がいるための処置) my $INVALID_LENG = 128; ## ## Weight for ## my $METAKEYW = 32; ## Length of summary my $SUMMARY_LENGTH = 200; ## ## robots.txt に関する設定 ## my $HTDOCUMENT_ROOT = "/usr/local/apache/share/htdocs"; my $HTDOCUMENT_ROOT_URL_PREFIX = "/usr/local/apache/share/htdocs"; my $ROBOTS_TXT = "$HTDOCUMENT_ROOT/robots.txt"; my $ROBOTS_EXCLUDE_URLS = ""; # hogehoge my $DeletedFilesCount = 0; my $UpdatedFilesCount = 0; my $APPENDMODE = 0; my $LastKeyN = 0; my $INTSIZE = 4; my $UnsignedCmp = 0; my @FList = (); my @Seed = (); my %PreupdatedFields = (); my %PhraseHash = (); my %KeyIndex = (); my $SYSTEM = ""; my $PSC = "/"; my $CCS = "euc"; my $LOCK_MSG_JA = ""; my $LOCK_MSG_EN = ""; my $BODY_MSG_JA = ""; my $BODY_MSG_EN = ""; my $LIBDIR2 = ""; my $DATEINDEX_ = ""; my $TARGET_DIR = ""; my $FLIST_ = ""; my $INDEX_ = ""; my $HEADERFILE_ = ""; my $FOOTERFILE_ = ""; my $PHRASE_ = ""; my $PHRASEINDEX_ = ""; my $REGLIST_ = ""; my $FLISTINDEX_ = ""; my $INDEXINDEX_ = ""; my $HASH_ = ""; my $WORDLIST_ = ""; # options my $NoPhraseIndexOpt = 0; my $DebugOpt = 0; my $QuietOpt = 0; my $RobotExcludeOpt = 0; my $NoFieldInfoOpt = 0; my $NoDateIndexOpt = 0; my $ManOpt = 0; my $NoMHonArcOpt = 0; my $UuencodeOpt = 0; my $MailNewsOpt = 0; my $NoLineAdOpt = 0; my $NoHeadAbstOpt = 0; my $HiraganaOpt = 0; my $OkuriganaOpt = 0; my $NoEdgeSymbolOpt = 0; my $NoSymbolOpt = 0; my $NoEncodeURL = 0; my $NoRegexpIndexOpt = 0; my $NoInsertHeaderOpt = 0; my $NoDeleteProcessing = 0; my $NoUpdateProcessing = 0; my $HtaccessExcludeOpt = 0; ## ## Program begins ## # STDOUT->autoflush(1); $| = 1; # autoflush STDIN initialize(); main(); sub main () { my ($swap, $all_file_size, $cfile_size, $file_count, $cfile, $start_time, $file_segment, $tmp); $file_segment = 0; $start_time = time; $file_segment = preparation_process(); set_lockfile(); $swap = 1; $file_count = 0; $all_file_size = 0; my $key_count = 0; # Process target files one by one foreach $cfile (@FList) { $cfile_size = namazu_core($cfile, $file_count, $file_segment); unless ($cfile_size) { $cfile = "" ; # remove @FList entry next; } $all_file_size += $cfile_size; $file_count++; if ($all_file_size > $ON_MEMORY_MAX * $swap) { if (%KeyIndex) { $key_count = put_index(); put_phrase_hash() unless $NoPhraseIndexOpt; } $swap++; } } if (%KeyIndex) { $key_count = put_index(); put_phrase_hash() unless $NoPhraseIndexOpt; } remain_process($all_file_size, $file_count, $key_count, $start_time); } sub dprint (@) { print STDERR @_ if $DebugOpt; } # Initializer # $PSC: Path Separate Character '/' or '\' # $CCS: Character Coding System 'euc' or 'sjis' sub initialize () { get_int_size(); @Seed = init_seed(); $SYSTEM= $^O; # $^O contains system name if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) { $PSC = "\\"; $CCS = "sjis"; $0 =~ m#^([A-Z]:)(/|\\)#i, $LIBDIR = $1 . $LIBDIR if ($LIBDIR !~ /^[A-Z]:/i); } else { $PSC = "/"; $CCS = "euc"; } $LIBDIR2 = cwd() . "${PSC}..${PSC}lib"; } # Core routine sub namazu_core ($$$) { my ($cfile, $file_count, $file_segment) = @_; my ($url, $cfile_size, $ctrl, $kanji, %fields); my ($title, $weighted_str, $contents, $headings, $err); $headings = ""; $contents = ""; $weighted_str = ""; $url = url_decchiagator($cfile); # Make a URL from a file name ($cfile_size, $ctrl, $kanji) = load_document(\$cfile, \$contents); # Do checking if ($err = check_file($cfile, \$contents, $ctrl, $cfile_size)) { print $file_count + $file_segment . " $url $err\n" unless $QuietOpt; print ERRORSFILE "$cfile $err\n"; return 0; # return with 0 if error } if ($RobotExcludeOpt) { if ($url =~ m/$ROBOTS_EXCLUDE_URLS/i) { $err = "is excluded because of /robots.txt.\n"; print $file_count + $file_segment . " $url $err\n"; print ERRORSFILE "$cfile $err\n"; return 0; # return with 0 if error } elsif ($cfile =~ /\.($HTML_SUFFIX)$/i && $contents =~ /META\s+NAME\s*=\s*([\'\"]?)ROBOTS\1\s+[^>]* CONTENT\s*=\s*([\'\"]?).*?(NOINDEX|NONE).*?\2[^>]*>/ix) { $err = "is excluded because of element."; print $file_count + $file_segment . " $url $err\n" unless $QuietOpt; print ERRORSFILE "$cfile $err\n"; return 0; # return with 0 if error } } # Output processing file name as URL print $file_count + $file_segment . " $url\n" unless $QuietOpt; document_filter($cfile, \$title, \$contents, \$weighted_str, \$headings, \%fields); make_field_info(\%fields, $cfile, $title, $url); put_file_info($url, $title, $cfile_size, \$contents, \$headings, $cfile, \%fields); put_field_info(\%fields) unless $NoFieldInfoOpt; put_dateindex($cfile) unless $NoDateIndexOpt; $contents .= $weighted_str; # add weight info count_words($file_count, $file_segment, \$contents, $kanji); make_phrase_hash($file_count, $file_segment, \$contents) unless $NoPhraseIndexOpt; $cfile_size; } sub make_field_info (\%$$$) { my ($fields, $cfile, $title, $url) = @_; unless (defined($fields->{date})) { my $mtime = (stat($cfile))[9]; my $date = rfc822time($mtime); $fields->{date} = $date; } unless (defined($fields->{title})) { my $tmp = $title; decode_entity(\$tmp); # since $title has been already encoded $fields->{title} = $tmp; } unless (defined($fields->{url})) { $fields->{url} = $url; } } # RFC 822 format without timezone sub rfc822time ($) { my ($time) = @_; my @week_names = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"); my @month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime($time); sprintf("%s, %.2d %s %d %.2d:%.2d:%.2d", $week_names[$wday], $mday, $month_names[$mon], $year + 1900, $hour, $min, $sec); } # output the field infomation into NMZ.fileds.* files sub put_field_info (\%) { my ($orig_fields) = @_; my ($key, @keys, $field); my (%fields) = %{$orig_fields}; my $aliases_regex = join('|', sort {length($b) <=> length($a)} keys %FIELD_ALIASES); foreach $field (keys %{fields}) { if ($field =~ /^($aliases_regex)$/) { unless (defined($fields{$FIELD_ALIASES{$field}})) { $fields{$FIELD_ALIASES{$field}} = $fields{$field}; } undef $fields{$field}; } } @keys = split('\|', $SEARCH_FIELD); foreach $key (@keys) { $key = lc($key); my $fname = "$FIELDINFO.$key.$$"; open(FIELD, ">>$fname") || die "$fname: $!\n"; if (defined($fields{$key})) { $fields{$key} =~ s/\s+/ /g; $fields{$key} =~ s/\s+$//; $fields{$key} =~ s/^\s+//; print FIELD $fields{$key}, "\n"; } else { print FIELD "\n"; } close(FIELD); $PreupdatedFields{$key} = 1; } } # put the date infomation into NMZ.t file sub put_dateindex ($) { my ($cfile) = @_; my $mtime = (stat($cfile))[9]; open(DATEINDEX, ">>$DATEINDEX_") || die "$DATEINDEX_: $!\n"; binmode(DATEINDEX); print DATEINDEX pack("i", $mtime); close(DATEINDEX); } # load a document file sub load_document ($$) { my ($orig_cfile, $contents) = @_; my ($line, $omake, $size, $ctrl, $kanji, $zipped, $filter, $ext); my $cfile = $$orig_cfile; return (0, 0, 0) unless (-f $cfile && -r $cfile); $ctrl = 0; $size = -s $cfile; return ($size, $ctrl, 0) if $size > $FILE_SIZE_LIMIT; $filter = ""; while ($cfile =~ /^.*\.($HELPER_EXTENSIONS)$/) { $ext = $1; if ($filter eq "") { $filter = "$HELPER_PROGRAMS{$ext} \"$cfile\" |"; } else { $filter .= "$HELPER_PROGRAMS{$ext} |"; } # if .gz or .Z, suppress the extention and continue if ($ext =~ /^(gz|Z)$/) { $zipped = 1; $cfile =~ s/\.$ext$//; } else { last; } } if ($LANGUAGE eq "ja") { if ($filter eq "") { $filter = "$NKF -emXZ1 \"$cfile\" |"; } else { $filter .= "$NKF -emXZ1 |"; } } else { if ($filter eq "") { $filter = "$cfile"; } } if ($ManOpt) { # man mode if ($filter =~ /\|$/) { $filter .= "$HELPER_PROGRAMS{'man'} |"; } else { $filter = "$HELPER_PROGRAMS{'man'}" . $filter . "|"; } } # consider a filename containing Shift_JIS under OS/2. $filter =~ s|\\|\\\\|g if $SYSTEM eq "os2"; open(CFILE, $filter) || die "$cfile: $!\n"; $$contents = join("", ); # if a zipped file, the size has been changed if ($zipped) { $size = length($$contents); return ($size, $ctrl, 0) if $size > $FILE_SIZE_LIMIT; } if ($ManOpt) { # processing like col -b (2byte character acceptable) $$contents =~ s/_\x08//g; $$contents =~ s/\x08{1,2}([\x20-\x7e]|[\xa1-\xfe]{2})//g; } $$contents =~ s/[ \t]+/ /g; # remain LFs v1.03 $$contents =~ s/\r\n/\n/g; # remain LFs is for ChaSen $$contents =~ s/\r/\n/g; # CR+LF or CR are into LF # Control characters be into space $ctrl = $$contents =~ tr/\x00-\x09\x0b-\x1f\xff/ /; $kanji = $$contents =~ tr/\xa1-\xfe/\xa1-\xfe/; # Kanji contained? close(CFILE); ($size, $ctrl, $kanji); } # not implimented yet. sub analize_rcs_stamp() { } # Filters sub document_filter ($$$$$$\%) { my ($orig_cfile, $title, $contents, $weighted_str, $headings, $fields) = @_; my ($mhonarc_opt); my $cfile = $orig_cfile; $cfile =~ s/\.(gz|Z)$//; # zipped file analize_rcs_stamp(); $mhonarc_opt = 1 if (!$NoMHonArcOpt && $$contents =~/^$MHONARC_HEADER/); if (ishtml($cfile)) { mhonarc_filter($contents, $weighted_str) if $mhonarc_opt; html_filter($contents, $weighted_str, $title, $fields, $headings); } elsif ($cfile =~ /rfc\d+\.txt/i ) { rfc_filter($contents, $weighted_str, $title); } elsif ($ManOpt) { man_filter($contents, $weighted_str, $title); } uuencode_filter($contents) if $UuencodeOpt; if ($mhonarc_opt || $MailNewsOpt) { mailnews_filter($contents, $weighted_str, $title, $fields); mailnews_citation_filter($contents, $weighted_str); } line_adjust_filter($contents) unless $NoLineAdOpt; line_adjust_filter($weighted_str) unless $NoLineAdOpt; white_space_adjust_filter($contents); filename_to_title($cfile, $title, $weighted_str) unless $$title; show_filter_debug_info($contents, $weighted_str, $title, $fields, $headings); } # Show debug information for filters sub show_filter_debug_info ($$$$) { my ($contents, $weighted_str, $title, $fields, $headings) = @_; dprint "-- title --\n$$title\n"; dprint "-- contents: --\n$$contents\n"; dprint "-- weighted_str: --\n$$weighted_str\n"; dprint "-- headings: --\n$$headings\n"; } # Adjust white spaces sub white_space_adjust_filter ($) { my ($text) = @_; $$text =~ s/^ +//gm; $$text =~ s/ +$//gm; $$text =~ s/ +/ /g; $$text =~ s/\n+/\n/g; } # ファイル名からタイトルを取得 (単なるテキストファイルの場合) sub filename_to_title ($\$\$) { my ($cfile, $title, $weighted_str) = @_; my ($tmp); # for MSWin32's filename using Shift_JIS [09/24/1998] if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) { $cfile = codeconv::shiftjis_to_eucjp($cfile); } $cfile =~ /^.*\Q$PSC\E([^\Q$PSC\E]*)$/ ; my $filename = $1; # ファイル名を元にキーワードを割り出してみる v1.1.1 # modified [09/18/1998] $tmp = $filename; $tmp =~ s|/\\_\.-| |g; $$weighted_str .= "\x7f$TITLEW\x7f$tmp\x7f/$TITLEW\x7f\n"; $$title = $filename . $TEXT_TITLE; } # HTML 用のフィルタ sub html_filter ($$$$$) { my ($contents, $weighted_str, $title, $fields, $headings) = @_; escape_lt_gt($contents); get_html_title($contents, $weighted_str, $title); get_author($contents, $fields); get_meta_info($contents, $weighted_str); get_img_alt($contents); get_table_summary($contents); get_title_attr($contents); normalize_html_tag($contents); erase_above_body($contents); weight_tag($contents, $weighted_str, $headings); erase_html_tags($contents); # それぞれ実体参照の復元 decode_entity($contents); decode_entity($weighted_str); decode_entity($headings); } # 単独の < > を実体参照に変換し、保護する # この処理は Perl の正規表現置換の仕様により、二重に行います sub escape_lt_gt ($) { my ($contents) = @_; $$contents =~ s/\s<\s/ < /g; $$contents =~ s/\s>\s/ > /g; $$contents =~ s/\s<\s/ < /g; $$contents =~ s/\s>\s/ > /g; } sub get_author ($$) { my ($contents, $fields) = @_; my ($author); # if ($$contents =~ m!]*?HREF=([\"\'])mailto:(.*?)\1>!i) { $fields->{author} = $2; } elsif ($$contents =~ m!.*]*>([^<]*?)!i) { my $tmp = $1; $tmp =~ s/\s//g; if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) { $fields->{author} = $1; } } } # TITLE を取り出す などにも考慮しています # が二つ以上あっても大丈夫 v1.03 sub get_html_title ($$$$) { my ($contents, $weighted_str, $title) = @_; if ($$contents =~ s/]*>([^<]+)<\/TITLE>//i) { $$title = $1; # TITLE を TITLEW 倍して末尾に追加 $$weighted_str .= "\x7f$TITLEW\x7f$$title\x7f/$TITLEW\x7f\n"; } else { $$title = $NO_TITLE; } $$title =~ s/\s+/ /g; $$title =~ s/^\s+//; $$title =~ s/\s+$//; } # に対応する処理 sub get_meta_info ($$) { my ($contents, $weighted_str) = @_; $$weighted_str .= "\x7f$METAKEYW\x7f$3\x7f/$METAKEYW\x7f\n" if $$contents =~ /]*CONTENT\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/i; $$weighted_str .= "\x7f$METAKEYW\x7f$3\x7f/$METAKEYW\x7f\n" if $$contents =~ /]*CONTENT\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/i; } # foo の foo の取り出し # HTML の扱いは厳密ではないです sub get_img_alt ($) { my ($contents) = @_; $$contents =~ s/]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; } # の foo の取り出し sub get_table_summary ($) { my ($contents) = @_; $$contents =~ s/]*\s+SUMMARY\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; } # の foo の取り出し sub get_title_attr ($) { my ($contents) = @_; $$contents =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; } # などを に統一 (エレメントをすべて削除) sub normalize_html_tag ($) { my ($contents) = @_; $$contents =~ s/<([!\w]+)\s+[^>]*>/<$1>/g; } sub erase_above_body ($) { my ($contents) = @_; $$contents =~ s/^.*//si; } # %TAGW に設定されている数値に応じて \x7fXX\x7f, \x7f/XX\x7f という架空のタグ # を作り、これで挟んでおく (\x7f は予めすべて空白に変換してある) # 単語のカウントの際にこの架空のタグを利用して計算する # が最初に処理されるように sort keys しています(安直)。 # というのは は他のタグの内側に来ることが多いからです # 厳密な入れ子処理は行っていません # さらに については要約作成のために怪しい処理をしている sub weight_tag ($$$ ) { my ($contents, $weighted_str, $headings) = @_; my ($tag); foreach $tag (sort keys(%TAGW)) { my ($tmp, $tagw); $tmp = ""; $$contents =~ s/<($tag)>([^<]*)<\/$tag>/weight_tag_sub($1, $2, \$tmp)/gie; $$headings .= $tmp if $tag =~ /^H[1-6]$/ && ! $NoHeadAbstOpt && $tmp; $tagw = $tag =~ /^H[1-6]$/ && ! $NoHeadAbstOpt ? $TAGW{$tag} : $TAGW{$tag} - 1; $$weighted_str .= "\x7f$tagw\x7f$tmp\x7f/$tagw\x7f\n" if $tmp; } } # HTML タグをすべて削除,タグによって空白を入れたり入れなかったりする sub erase_html_tags ($) { my ($contents) = @_; 1 while ($$contents =~ s/<\/?([^<>]*)>/tag_to_space_or_null($1)/ge); } # 指定されたタグの文字列を処理するためのサブルーチン sub weight_tag_sub ($$$) { my ($tag, $text, $tmp) = @_; my ($space); $space = tag_to_space_or_null($tag); $text =~ s/<[^>]*>//g; $$tmp .= "$text " if (length($text)) < $INVALID_LENG; $tag =~ /^H[1-6]$/i && ! $NoHeadAbstOpt ? " " : "$space$text$space"; } # numberd entity の復元を行う / 無効な値ははじく sub decode_numbered_entity ($) { my ($num) = @_; return "" if $num >= 0 && $num <= 8 || $num >= 11 && $num <= 31 || $num >=127; sprintf ("%c",$num); } # 実体参照の復元 ISO-8859-1 の右半分は無視します # HTML 2.x で拡張された numbered entity には未対応です # どちらも日本語 EUC では無理なのです # " < > のように空白で続けて最後に ; をつける記述も大丈夫 v1.03 sub decode_entity ($) { my ($text) = @_; return unless defined($$text); $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge; $$text =~ s/"[;\s]/\"/g; $$text =~ s/&[;\s]/&/g; $$text =~ s/<[;\s]//g; $$text =~ s/ / /g; ## 特別扱い v1.1.2.1 } # '<' と '>' '&' を実体参照へ変換 sub encode_entity ($) { my ($tmp) = @_; $$tmp =~ s/&/&/g; # & は最初に処理しないとまずい $$tmp =~ s//>/g; $$tmp; } # 指定されたタグが単に削除すべきものか空白に変換すべきか判定する sub tag_to_space_or_null ($) { $_[0] =~ /^($NON_SEPARATION_TAGS)$/i ? "" : " "; } # MHonArc 用のフィルタ # MHonArc v2.1.0 が標準で出力する HTML を想定しています sub mhonarc_filter ($$) { my ($contents, $weighted_str) = @_; # MHonArc を使うときはこんな感じに処理すると便利 $$contents =~ s/.*//s; $$contents =~ s/.*//s; $$contents =~ s/.*//s; $$contents =~ s//\n/; # ヘッダと本文を区切る $$contents =~ s/^
  • //gim; # ヘッダの前に空白をあけたくないから $$contents =~ s/<\/?EM>//gi; # ヘッダの名前をインデックスにいれたくない $$contents =~ s/^\s+//; } # Mail/News 用のフィルタ # 元となるものは古川@ヤマハさんにいただきました sub mailnews_filter ($$$\%) { my ($contents, $weighted_str, $title, $fields) = @_; my ($line, $boundary, $partial, @tmp); $$contents =~ s/^\s+//; # 1 行目がヘッダっぽくないファイルは、ヘッダ処理しない return unless ($$contents =~ /(^\S+:|^from )/i); @tmp = split(/\n/, $$contents); HEADER_PROCESSING: while (@tmp) { $line = shift(@tmp); last if ($line =~ /^$/); # if an empty line, header is over # connect the two lines if next line has leading spaces while (defined($tmp[0]) && $tmp[0] =~ /^\s+/) { # if connection is Japanese character, remove spaces # from Furukawa-san's idea [09/22/1998] my $nextline = shift(@tmp); $line =~ s/([\xa1-\xfe])\s+$/$1/; $nextline =~ s/^\s+([\xa1-\xfe])/$1/; $line .= $nextline; } # keep field info if ($line =~ /^(\S+):\s(.*)/i) { my $name = $1; my $value = $2; $fields->{lc($name)} = $value; if ($name =~ /^($REMAIN_HEADER)$/i) { # keep some fields specified REMAIN_HEADER for search keyword $$weighted_str .= "\x7f$REMAIN_HEADER_W\x7f$value\x7f/$REMAIN_HEADER_W\x7f\n"; } } if ($line =~ s/^subject:\s*//i){ $$title = $line; encode_entity($title); # ML 特有の [hogehoge-ML:000] を読み飛ばす。 # のが意図だが、面倒なので、 # 実装上、最初の [...] を読み飛ばす。 $line =~ s/^\[.*?\]\s*//; # 'Re:' を読み飛ばす。 $line =~ s/\bre:\s*//gi; $$weighted_str .= "\x7f$TITLEW\x7f$line\x7f/$TITLEW\x7f\n"; } elsif ($line =~ s/^content-type:\s*//i) { if ($line =~ /multipart.*boundary="(.*)"/i){ $boundary = $1; dprint "((boundary: $boundary))\n"; } elsif ($line =~ m!message/partial;\s*(.*)!i) { # The Message/Partial subtype routine [10/12/1998] # contributed by Hiroshi Kato $partial = $1; dprint "((partial: $partial))\n"; } } } if ($partial) { # MHonARC makes several empty lines between header and body, # so remove them. while(@tmp) { last if (! $line =~ /^\s*$/); $line = shift(@tmp); } undef $partial; goto HEADER_PROCESSING; } $$contents = join("\n", @tmp); if ($boundary) { # MIME の Multipart をそれなりに処理する $boundary =~ s/(\W)/\\$1/g; $$contents =~ s/This is multipart message.\n//i; # MIME multipart processing, # modified by Furukawa-san's patch on [1998/08/27] $$contents =~ s/--$boundary(--)?\n?/\xff/g; my (@parts) = split(/\xff/, $$contents); $$contents = ''; foreach $_ (@parts){ if (s/^(.*?\n\n)//s){ my ($head) = $1; $$contents .= $_ if $head =~ /^content-type:.*text\/plain/mi; } } } } # Mail/News の引用マークを片付ける # また冒頭の名乗るだけの行や、引用部分、◯◯さんは書きましたなどの行は # 要約に含まれないようにする (やまだあきらさんのアイディアを頂きました) sub mailnews_citation_filter ($$) { my ($contents, $weighted_str) = @_; my ($line, $omake, $i, @tmp); $omake = ""; $$contents =~ s/^\s+//; @tmp = split(/\n/, $$contents); $$contents = ""; # 冒頭の名乗り出る部分を処理 (これは最初の 1,2 行めにしかないでしょう) for ($i = 0; $i < 2 && defined($tmp[$i]); $i++) { if ($tmp[$i] =~ /(^\s*((([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(。|.|\.|,|,|、|\@|@|の)\s*){0,2}\s*(([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(です|と申します|ともうします|といいます)(.{0,2})?\s*$)/) { # デバッグ情報から検索するには perl -n00e 'print if /^<<<>>>\n\n"; $omake .= $tmp[$i] . "\n"; $tmp[$i] = ""; } } # 引用部分を隔離 foreach $line (@tmp) { # 行頭に HTML タグが来た場合は引用処理しない if ($line !~ /^[^>]*)|([\>\|\:\#]+\s*))+//) { $omake .= $line . "\n"; $$contents .= "\n"; # 改行をいれよう next; } $$contents .= $line. "\n"; } # ここでは空行を区切りにした「段落」で処理している # 「◯◯さんは△△の記事において□□時頃書きました」の類いを隔離 @tmp = split(/\n\n+/, $$contents); $$contents = ""; $i = 0; foreach $line (@tmp) { # 完全に除外するのは無理だと思われます。こんなものかなあ # この手のメッセージはせいぜい最初の 5 段落くらいに含まれるかな # また、 5 行より長い段落は処理しない。 # それにしてもなんという hairy 正規表現だろう… if ($i < 5 && ($line =~ tr/\n/\n/) <= 5 && $line =~ /(^\s*(Date:|Subject:|Message-ID:|From:|件名|差出人|日時))|(^.+(返事です|reply\s*です|曰く|いわく|書きました|言いました|話で|wrote|said|writes|says)(.{0,2})?\s*$)|(^.*In .*(article|message))|(<\S+\@([\w-.]\.)+\w+>)/im) { dprint "\n\n<<<<$line>>>>\n\n"; $omake .= $line . "\n"; $line = ""; next; } $$contents .= $line. "\n\n"; $i++; } $$weighted_str .= "\x7f1\x7f$omake\x7f/1\x7f\n"; } # RFC 用のフィルタ # わりと書式はまちまちみたいだからそれなりに sub rfc_filter ($$$) { my ($contents, $weighted_str, $title) = @_; $$contents =~ s/^\s+//s; $$contents =~ s/((.+\n)+)\s+(.*)//; $$title = $3; encode_entity($title); $$weighted_str .= "\x7f1\x7f$1\x7f/1\x7f\n"; $$weighted_str .= "\x7f$TITLEW\x7f$$title\x7f/$TITLEW\x7f\n"; # summary または Introductionがあればそれを先頭に持ってくる $$contents =~ s/([\s\S]+^(\d+\.\s*)?(Abstract|Introduction)\n\n)//im; $$weighted_str .= "\x7f1\x7f$1\x7f/1\x7f\n"; } # man 用のフィルタ # よくわからないからいいかげんに sub man_filter ($$$) { my ($contents, $weighted_str, $title) = @_; my ($name); $$contents =~ s/^\s+//gs; $$contents =~ /^(.*?)\s*\S*$/m; $$title = "$1"; encode_entity($title); $$weighted_str .= "\x7f$TITLEW\x7f$$title\x7f/$TITLEW\x7f\n"; if ($$contents =~ /^(?:NAME|名前|名称)\s*\n(.*?)\n\n/ms) { $name = "$1::\n"; $$weighted_str .= "\x7f$TAGW{'H1'}\x7f$1\x7f/$TAGW{'H1'}\x7f\n"; } if ($$contents =~ s/(.+^(?:DESCRIPTION 解説|DESCRIPTIONS?|SHELL GRAMMAR|INTRODUCTION|【概要】|解説|説明|機能説明|基本機能説明)\s*\n)//ims) { $$contents = $name . $$contents; $$weighted_str .= "\x7f1\x7f$1\x7f/1\x7f\n"; } } # uuencode の読み飛ばしルーチンは古川@ヤマハさんがくださりました。[09/28/1997] # 重ね重ね感謝です。後日 BinHex も追加してもらいました [11/13/1997] # 私がいじったことによるバグを修正してくださいました [02/05/1998] Thanks! sub uuencode_filter ($) { my ($contents) = @_; my ($line, @tmp, $uunumb); my ($uuord, $uuin); @tmp = split(/\n/, $$contents); $$contents = ""; while (@tmp) { $line = shift(@tmp); $line .= "\n"; # BinHex の読み飛ばし # 仕様がよく分からないので、最後まで飛ばす last if $line =~ /^\(This file must be converted with BinHex/; #) # uuencode の読み飛ばし # 参考文献 : SunOS 4.1.4 の man 5 uuencode # FreeBSD 2.2 の uuencode.c # 偶然マッチしてしまった場合のデメリットを少なくするため # 本体のフォーマットチェックを行なう # # News などでファイルを分割して投稿されているものの場合 begin がない # ことがあるのでそれを考慮します by S.Takabayashi [v1.0.5] # 偶然マッチすることはほとんどないとは思いますが… # # length は 62 と 63 があるみたい… [v1.0.5] # もしかしたら他にも違いがあるのかも # # 仕様を忠実に表現すると、 # int((ord($line) - ord(' ') + 2) / 3) # != (length($line) - 2) / 4 # となるが、式を変形して… # 4 * int(ord($line) / 3) != length($line) + $uunumb; # SunOS の uuencode は、encode に空白も使っている。 # しかし、空白も認めると、一般の行を uuencode 行と誤認する # 可能性が高くなる。 # 折衷案として、次のケースで認める。 # begin と end の間 # 前の行が uuencode 行と判断されて、ord が前の行と同じ # 一行が 0x20-0x60 の文字のみで構成される場合のみ uuencode # とみなす v1.1.2.3 (bug fix) $uuin = 1, next if $line =~ /^begin [0-7]{3,4} \S+$/; if ($line =~ /^end$/){ $uuin = 0,next if $uuin; }else{ # ここで、ord の値は 32-95 の範囲に $uuord = 32 if ($uuord = ord($line)) == 96; # uunumb = 38 の行が loop の外に出ていると、 # 一般の行で 63 文字の行があったら誤動作してしまう $uunumb = (length($line)==63)? 37: 38; if ((32 <= $uuord && $uuord < 96) && length($line) <= 63 && (4 * int($uuord / 3) == length($line) + $uunumb)){ if ($uuin == 1 || $uuin == $uuord){ next if $line =~ /^[\x20-\x60]+$/; } else { # beginから始まっていないものは厳しくしよう [05/22/1998] $uuin = $uuord, next if $line =~ /^M[\x21-\x60]+$/; } } } $uuin = 0; $$contents .= $line; } } # 行頭・行末の空白、タブ、行頭の > | # を削除 (':' もつけ加えた by 高林) # 行末が日本語で終わる場合は改行コードを削除 # この部分のコードは古川@ヤマハさんがくださりました。[09/15/1997] # 英文ハイフォネーションの解除は私が付け足しました # 40文字未満の行について行末の日本語連結処理を行わないようにした v1.1.1 sub line_adjust_filter ($) { my ($text) = @_; my ($line, @tmp); return if (!defined($$text)); @tmp = split(/\n/, $$text); foreach $line (@tmp) { $line .= "\n"; $line =~ s/^[ \>\|\#\:]+//; $line =~ s/ +$//; $line =~ s/\n// if (($line =~ /[\xa1-\xfe]\n*$/) && (length($line) >=40)); $line =~ s/(。|、)$/$1\n/; $line =~ s/([a-z])-\n/$1/; # for hyphenation. } $$text = join('', @tmp); } # 準備 sub preparation_process ($$$) { my ($output_dir, $target_dir, $file_segment); $file_segment = 0; ($output_dir, $target_dir) = get_commandline_opt(); dbnamechange($output_dir); check_present_index(); ParseRobotsTxt() if ($RobotExcludeOpt); my $current_dir = cwd(); chdir $target_dir || die "$target_dir: $!\n"; $TARGET_DIR = cwd(); # $URL_PREFIX が \t なら $target_dir の cwd を元にセット v1.1.1 $URL_PREFIX = cwd() . "$PSC" if $URL_PREFIX eq "\t"; find::findfiles($PSC) unless @FList; chdir $current_dir; $file_segment = do_append_preprocessing() if -e $REGLIST; unless (@FList) { # if @FList is empty print "No files to index.\n"; exit; } if ($SYSTEM eq "MSWin32") { # 例によって Win32 のパイプは変なので別処理になる open(FLIST, ">$FLIST_") || die "Can't open $FLIST_.\n"; } else { if ($LANGUAGE eq "ja") { open(FLIST, "|$NKF -jZ >$FLIST_") || die "$FLIST_: $!\n"; } else { open(FLIST, ">$FLIST_") || die "$FLIST_: $!\n"; } } binmode(FLIST); open(ERRORSFILE, ">>$ERRORSFILE") || die "$ERRORSFILE: $!\n"; binmode(ERRORSFILE); return $file_segment; } sub set_lockfile () { # make a lock file if (-f $LOCKFILE2) { print "NMZ.lock2 found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n"; exit 1; } else { open(LOCKFILE2, ">$LOCKFILE2"); print LOCKFILE2 "$$"; # save pid close(LOCKFILE2); } } sub remove_lockfile () { # lock ファイルを削除する unlink $LOCKFILE2; } # 既存のインデックスの byte order を確認する sub check_present_index () { if (((is_big_endian()) && -f $LITTLEENDIAN) || ((!is_big_endian()) && -f $BIGENDIAN)) { die "!!CAUTION!!\nPresent index was made with opposite byte order.\nYou should run 'rvnmz' to change it.\n"; } } # 後処理 sub remain_process ($$$$) { my ($all_file_size, $file_count, $key_count, $start_time) = @_; my ($tmp, @tmp); close(FLIST); close(ERRORSFILE); @tmp = grep(!/^$/, @FList); if (@tmp) { if ($SYSTEM eq "MSWin32" && $LANGUAGE eq "ja") { # MSWin32 だと書き直してあげないといけない。 open(FLIST, "$NKF -jZ $FLIST_|") || die "$FLIST_: $!\n"; open(FLISTTMP, ">$FLIST_.tmp") || die "$FLIST_.tmp: $!\n"; binmode(FLISTTMP); print FLISTTMP while ; close(FLISTTMP); close(FLIST); Rename("$FLIST_.tmp", $FLIST_); } append_flist() if $APPENDMODE; make_flist_index(); put_registration_file(); put_lock_msg(); put_body_msg(); update_field_info(); update_dateindex(); put_nmz_files(); put_endian_stamp(); make_slog_file(); } else { # No files are indexed remove_backup_files(); } make_headfoot_pages($file_count, $key_count); put_log($all_file_size, $start_time, $file_count, $key_count); remove_lockfile(); } sub make_headfoot_pages($$) { my ($file_count, $key_count) = @_; make_headfoot("$HEADERFILE.ja", $file_count, $key_count); make_headfoot("$FOOTERFILE.ja", $file_count, $key_count); make_headfoot("$HEADERFILE.en", $file_count, $key_count); make_headfoot("$FOOTERFILE.en", $file_count, $key_count); } sub remove_backup_files() { unlink("$FLIST_.base"); unlink($FLIST_); unlink($INDEX_); unlink("$HEADERFILE_.ja"); unlink("$FOOTERFILE_.ja"); unlink("$HEADERFILE_.en"); unlink("$FOOTERFILE_.en"); unlink($PHRASE_); unlink($PHRASEINDEX_); } # コマンドラインの引数の処理 sub get_commandline_opt () { my ($target_dir, $target_loaded, $output_dir); $output_dir = ""; usage() if (@ARGV == 0); while (defined($ARGV[0]) && $ARGV[0] =~ /^-/) { $TARGET_FILE = ".*" if $ARGV[0] =~ /a/; $WAKATI = $KAKASI, $MorphOpt = 0 if $ARGV[0] =~ /k/; $WAKATI = $CHASEN, $MorphOpt = 0 if $ARGV[0] =~ /c/; $WAKATI = $CHASEN_MORPH, $MorphOpt = 1 if $ARGV[0] =~ /m/; $UuencodeOpt = 1 if $ARGV[0] =~ /u/; $MailNewsOpt = 1 if $ARGV[0] =~ /h/; if ($ARGV[0] =~ /r/) { $ManOpt = 1; $TARGET_FILE = '.*\.\d.*'; } $HiraganaOpt = 1 if $ARGV[0] =~ /H/; $OkuriganaOpt = 1 if $ARGV[0] =~ /G/; $NoEdgeSymbolOpt = 1 if $ARGV[0] =~ /E/; $NoSymbolOpt = 1 if $ARGV[0] =~ /K/; $NoLineAdOpt = 1 if $ARGV[0] =~ /L/; $NoMHonArcOpt = 1 if $ARGV[0] =~ /M/; $NoEncodeURL = 1 if $ARGV[0] =~ /U/; $DebugOpt = 1 if $ARGV[0] =~ /d/; $NoHeadAbstOpt = 1 if $ARGV[0] =~ /x/; $RobotExcludeOpt = 1 if $ARGV[0] =~ /e/; $QuietOpt = 1 if $ARGV[0] =~ /q/; $NoPhraseIndexOpt = 1 if $ARGV[0] =~ /P/; $NoRegexpIndexOpt = 1 if $ARGV[0] =~ /R/; $NoInsertHeaderOpt = 1 if $ARGV[0] =~ /D/; $NoDateIndexOpt = 1 if $ARGV[0] =~ /W/; $NoFieldInfoOpt = 1 if $ARGV[0] =~ /X/; $NoDeleteProcessing = 1 if $ARGV[0] =~ /Y/; $NoUpdateProcessing = 1 if $ARGV[0] =~ /Z/; $HtaccessExcludeOpt = 1 if $ARGV[0] =~ /A/; if ($ARGV[0] =~ /O$/) { shift @ARGV; $output_dir = $ARGV[0]; $output_dir =~ s|\Q$PSC\E*$||; print "Index output directory: $ARGV[0]\n" unless $QuietOpt; } elsif ($ARGV[0] =~ /T$/) { shift @ARGV; $LIBDIR = $ARGV[0]; $LIBDIR =~ s|\Q$PSC\E*$||; } elsif ($ARGV[0] =~ /I$/) { shift @ARGV; include($ARGV[0]); print "Included: $ARGV[0]\n" unless $QuietOpt; } elsif ($ARGV[0] =~ /l$/) { # small letter of 'L' shift @ARGV; $LANGUAGE = $ARGV[0]; } elsif ($ARGV[0] =~ /F$/) { shift @ARGV; load_target_list($ARGV[0]); print "Loaded: $ARGV[0]\n" unless $QuietOpt; $target_loaded = 1; $target_dir = cwd(); } elsif ($ARGV[0] =~ /t$/) { shift @ARGV; print "TARGET: $ARGV[0]\n" unless $QuietOpt; $TARGET_FILE = $ARGV[0]; } shift @ARGV; } usage() if (@ARGV == 0 && !$target_loaded && $output_dir eq ""); if ($#ARGV > 0 || $#ARGV == 0 && $target_loaded) { $URL_PREFIX = $ARGV[0]; shift @ARGV; } $target_dir = $ARGV[0] if defined $ARGV[0]; $output_dir = cwd() if $output_dir eq ""; die "$output_dir: invalid output directory\n" unless (-d $output_dir && -w $output_dir); if ($SYSTEM eq "MSWin32") { $target_dir =~ s/\//\\/g; $output_dir =~ s/\//\\/g; } ($output_dir, $target_dir); } sub include($) { my ($filename) = @_; open(INCLUDE, $filename) or die "$filename: $!"; my $code = join('',); close(INCLUDE); eval $code; } sub load_target_list ($) { my ($file) = @_; my $cwd = cwd(); open(TLIST, "$file") || die "$file: $!\n"; @FList = ; close(TLIST); # convert a relative path into an absolute path grep(s/^\.\Q$PSC\E/$cwd$PSC/, @FList); if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) { grep(s/^([A-Z](?!\Q:$PSC\E))/$cwd$PSC$1/i, @FList); } else { grep(s/^([^\Q$PSC\E])/$cwd$PSC$1/, @FList); } grep(chop, @FList); # traverse directories # this routine is not efficent but I prefer reliable logic. my @tmp = @FList; my @tmp2 = (); @FList = (); while (@tmp) { $_ = shift (@tmp); if (s!\Q$PSC\E$!! && -d $_) { # path ending with $PSC my $cwd = cwd(); chdir $_; find::findfiles($PSC); push(@tmp2, @FList); @FList = (); chdir $cwd; } else { push(@tmp2, $_); } } @FList = @tmp2; } sub usage () { if ($LANGUAGE eq "ja") { if ($CCS eq "euc") { print STDERR $USAGE_JA; } elsif ($CCS eq "sjis") { open(NKF, "|$NKF -s"); print NKF $USAGE_JA; close(NKF); } } else { print STDERR $USAGE_EN; } exit; } # make a URL from a file name sub url_decchiagator ($) { my ($tmp) = @_; return undef unless defined $tmp; my $url = $tmp; # remove a file name if omittable $url =~ s!(.*)\Q$PSC\E($DEFAULT_FILE)(\?.*)?$!$1/$3!; $url =~ s/\Q$TARGET_DIR$PSC\E/$URL_PREFIX/; if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) { # Shift_JIS の漢字を考慮して \ を / に変換 [09/26/1998] $url =~ s!([\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x01-\x7f])! $1 eq "\\" ? "/" : $1!gex; $url =~ s#^([A-Z]):#/$1|#i; # ドライヴ部分を /C| のように置き換え } unless ($NoEncodeURL) { # Escape unsafe characters (not strict) $url =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2 $url =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/ sprintf("%%%02X",ord($1))/ge; if (($SYSTEM eq "MSWin32") || ($SYSTEM eq "os2")) { # restore '|' for drive letter rule of Win32, OS/2 $url =~ s!^/([A-Z])%7C!/$1|!i; } } $url; } # check the file -- 0: OK / 1: NG sub check_file ($$$$) { my ($cfile, $contents, $ctrl, $size) = @_; # コントロール文字が全体の 3 % よりも多ければバイナリファイル # とみなし、スキップする (-B は問題があるので使わない) if ($size == 0) { "is 0 size! skipped."; } elsif (int ($ctrl / $size * 100) > 3) { "may be a BINARY file! skipped." } elsif ($size > $FILE_SIZE_LIMIT) { "is too LARGE file! skipped."; } elsif (!$NoMHonArcOpt && $cfile !~ /($MHONARC_MESSAGE_FILE)$/ && $$contents =~ /^$MHONARC_HEADER/) { "is MHonArc's index file! skipped."; } else { ""; } } # update REGLIST = NMZ.r file sub put_registration_file () { update_registration_file() if -e $REGLIST_; # preupdated file exists return if (@FList == 0); open(REGLIST, ">>$REGLIST") || die "$REGLIST: $!\n"; binmode(REGLIST); @FList = grep($_ ne '', @FList); print REGLIST join("\n", @FList), "\n"; print REGLIST "## indexed: " . rfc822time(time()) . "\n\n"; close(REGLIST); } # Rename *.$$ to each real file name sub put_nmz_files () { # Set the lock file open(LOCKFILE, ">>$LOCKFILE"); close(LOCKFILE); Rename($FLIST_, $FLIST); Rename($FLISTINDEX_, $FLISTINDEX); Rename($INDEX_, $INDEX); Rename($INDEXINDEX_, $INDEXINDEX); Rename($HASH_, $HASH); Rename($WORDLIST_, $WORDLIST); Rename($PHRASE_, $PHRASE); Rename($PHRASEINDEX_, $PHRASEINDEX); # remove the lock file unlink $LOCKFILE; } # Set a file to indentify byte order sub put_endian_stamp () { if (is_big_endian()) { open(TMP, ">>$BIGENDIAN"); } else { open(TMP, ">>$LITTLEENDIAN"); } close(TMP); } # output NMZ.msg sub put_lock_msg () { write_message_to_file("$LOCKMSGFILE.ja", $LOCK_MSG_JA); write_message_to_file("$LOCKMSGFILE.en", $LOCK_MSG_EN); } # output NMZ.body sub put_body_msg () { write_message_to_file("$BODYMSGFILE.ja", $BODY_MSG_JA); write_message_to_file("$BODYMSGFILE.en", $BODY_MSG_EN); } # output NMZ.body and etc. sub write_message_to_file ($$) { my ($full_path_name, $msg) = @_; if (! -e $full_path_name) { my ($template, $fname); $full_path_name =~ /.*\Q$PSC\E(.*)$/; $fname = $1; if ( -e "$LIBDIR$PSC$fname") { $template = "$LIBDIR$PSC$fname"; } else { $template = "$LIBDIR2$PSC$fname"; } if (-e $template) { my ($buf); open(TEMPLATE, $template) || die "$template: $!\n"; if ($LANGUAGE eq "ja") { open(OUTPUT ,"|$NKF -j >$full_path_name") || die "$full_path_name: $!\n"; } else { open(OUTPUT ,">$full_path_name") || die "$full_path_name: $!\n"; } $buf = join('',