#!/usr/bin/awk -f

# urlgrep[.awk] v1 - print http|https|ftp|rtsp|mms:// URLs found in text
# by xmb - localhack
# Recovered from grml-scripts.git, usr_bin/urlgrep.awk (blob 32e3188).
#
# usage: urlgrep [-v dup=[01]] [[+] regex] [! regex] [file|-] [files..]
#   no files specified = read from stdin
#
#   [+] regex   only print URLs matching regex
#   ! regex     do not print URLs matching regex
#   -v dup=0    also print URLs that were already printed
#   -v quote=1  wrap every printed URL in single quotes
#   -v DEBUG=n  diagnostics on stderr (n > 1 also echoes scanned lines)
#
# environment (same settings, read before the command line is parsed):
#   URLGREP_NODUP, URLGREP_GREP, URLGREP_NOGREP, URLGREP_URL, URLGREP_DEBUG,
#   URLGREP_DUP_DOMAIN, URLGREP_DOMAIN_ONLY
#
# exit status: 3 when no URL was printed.
#
# todo: doing hpfetch with v2 of this
# todo: v2: multiple positive/negative regex support
# todo: v2: recognition: http://blah.com/path) text
#
# history:
#   24.12.2004, 26.12.2004, 02.01.2005, 26.01.2005 (rtsp support)
#   28.01.2005 (grep like syntax and ! )
#   09.02.2005 (fixed args, others, renamed from httpgrep to urlgrep)
#   12.02.2005 (mms support), 28.02.2005 (robustness update)
#   07.03.2005 (conf enhancements)
#   24.03.2005 (regex/gsub enh's, more cmd examples)
#   01.05.2005 (-- stop arg)
#
# examples:
#   $ cp urlgrep.awk /usr/local/bin/urlgrep
#
#   $ urlgrep regex ! regex files
#   $ urlgrep '' file
#   $ urlgrep . file
#   $ curl -s www.blah.ch | urlgrep
#   $ wget -qO- www.microsoft.com | urlgrep [| xargs curl -sD-]
#   $ curl -s www.apple.com | urlgrep ! 200[25] +[^m]/$
#
#   $ curl -s http://e2e.serveftp.net/wee/ | urlgrep wmv$ | xargs -n1 mplayer
#
# example site aightgenossen.ch, play media files with max 3 mplayers@1p
# (unfinished, v2 signal todo):
#   $ wget xmb.ath.cx/threads.sh
#   $ . threads.sh ; threads_max=3
#   $ getwhile() { [[ ! $1 ]] && echo $last && return 1; last=$( curl -s $@ |
#   > urlgrep ) match=$( grep -E '(wm[av]|rm|mov|avi)$|^rtsp' ) && getwhile }
#   curl -s aightgenossen.ch/index.php/m=multimedia

BEGIN {
	# Defaults; each can be preset with -v name=value. The extra != "0"
	# comparison keeps an explicit "-v dup=0" from being treated as unset.
	if (! dup && dup != "0") dup = 1		# filter duplicates, on by default
	# dup_domain and show_dom_only are only given defaults here; no rule in
	# this version of the script acts on them.
	if (! dup_domain && dup_domain != "0") dup_domain = 0
	if (! show_dom_only && show_dom_only != "0") show_dom_only = 0

	# Environment map, as triples: <suffix> <CONF key> <value>.
	# <suffix> is looked up as URLGREP_<suffix>; a <value> of "." means
	# "use the content of the environment variable", anything else is the
	# fixed value stored when the variable is set (URLGREP_NODUP -> dup = 0).
	map = "NODUP dup 0"
	map = map " GREP grep ."
	map = map " NOGREP nogrep ."
	map = map " URL url ."
	map = map " DEBUG DEBUG ."
	map = map " DUP_DOMAIN dup_domain ."
	map = map " DOMAIN_ONLY domain_only ."
	narg = split(map, Args)

	for (arg = 1; arg + 2 <= narg; arg += 3) {
		env = "URLGREP_" Args[arg]
		# "in" first, so probing does not create empty ENVIRON entries
		if (! ((env in ENVIRON) && ENVIRON[env])) continue
		if (Args[arg + 2] == ".") CONF[ Args[arg + 1] ] = ENVIRON[env]
		else CONF[ Args[arg + 1] ] = Args[arg + 2]
	}

	# Command line: leading regex arguments are consumed and removed from
	# ARGV so that awk only sees the remaining ones as input files.
	# The first argument is always taken as a regex; parsing continues only
	# while the next argument starts with '+', '!' or '-'.
	i = 0
	while (ARGV[++i]) {
		if (ARGV[i] ~ /^!/) CONF["nogrep"] = get_arg(i)
		else CONF["grep"] = get_arg(i)

		# get_arg() sets skip when the regex was the following argument
		if (skip) { skip = 0; delete ARGV[i]; delete ARGV[++i] }
		else delete ARGV[i]

		if (ARGV[i + 1] !~ /^[+!-]/) break	# stop after having enough regexes
	}

	if (CONF["grep"]) grep = CONF["grep"]
	if (CONF["nogrep"]) nogrep = CONF["nogrep"]
	if (CONF["url"]) url = CONF["url"]
	if (CONF["DEBUG"]) DEBUG = CONF["DEBUG"]
	# the stored value for dup is 0, so test for presence, not truth
	if ("dup" in CONF) dup = CONF["dup"]

	err = "/dev/stderr"
	r_h = "(https?|ftp|rtsp|mms)://"			# scheme head
	r_m = r_h " *['\"]?[^ \t'\"<>]+\\.[^ \t'\"<>]+"	# full URL match

	if (DEBUG)
		printf("url = %s, match '%s', dont match '%s'\n",
			(url) ? url : "none", grep, nogrep) > err
}

# Scan every line that contains a scheme head; extract all URLs on it.
$0 ~ r_h {
	if (DEBUG > 1) print "LINE:", $0 > err

	while (match($0, r_m)) {
		s = substr($0, RSTART, RLENGTH)
		total++

		# Seen[] is only consulted (and counted) while dup filtering is on
		if (! dup || ! Seen[s]++)
			_p(s)
		else
			found_dup++

		sub(r_m, "")		# drop the URL just handled, then rescan
	}
}

# Summary on stderr; exit 3 without a summary when nothing was printed.
END {
	fflush()
	if (! count) exit 3

	printf "urlgrep: %d total", total > err
	if (found_dup)
		printf ", %d printed, %d duplicates", count, found_dup > err
	if (found_dup + count < total)
		printf ", %d not matched", total - (found_dup + count) > err
	printf "\n" > err
	fflush()
}

# get_arg(p): return the regex carried by command line argument p.
#   "+re" / "!re" / "-re"  -> "re" (prefix and following blanks removed)
#   "+" / "!" / "-" alone  -> the next argument; the global skip is set to 1
#                             so the caller removes both ARGV entries
#   anything else          -> the argument itself
# tmp is a local variable.
function get_arg(p,    tmp) {
	if (ARGV[p] !~ /^[+!-]/)
		return ARGV[p]

	if (length(ARGV[p]) > 1) {
		tmp = substr(ARGV[p], 2)
		sub(/^ */, "", tmp)
		return tmp
	}

	skip = 1
	return ARGV[p + 1]
}

# _p(s): main match/nomatch check & print function.
# Returns 1 without printing when s fails the grep regex or hits the nogrep
# regex. Otherwise strips blanks, tabs and quotes from s, prints it (wrapped
# in single quotes when quote is set) and increments the global count.
function _p(s) {
	if (grep && s !~ grep) return 1
	if (nogrep && s ~ nogrep) return 1

	gsub(/[ \t'"]+/, "", s)
	if (quote)
		print "'" s "'"
	else
		print s
	count++
}