#!/usr/bin/awk -f
# /* 24.12.2004
# urlgrep[.awk] v1 - print http|ftp|rtsp|mms:// text entries in text
# urlgrep [-v dup=[01]] [[+] regex] [! regex] [file|-] [files..]
# no files specified = read from stdin
# todo: doing hpfetch with v2 of this
# todo: v2: multiple positive/negative regex support
# todo: v2: recognition: http://blah.com/path) text
# by xmb - localhack
# 24.12.2004, 26.12.2004, 02.01.2005, 26.01.2005(rtsp support)
# 28.01.2005(grep like syntax and ! )
# 09.02.2005(fixed args, others, renamed from httpgrep to urlgrep)
# 12.02.2005(mms support), 28.02.2005(robustness update)
# 07.03.2005(conf enhancements)
# 24.03.2005(regex/gsub enh's, more cmd examples)
# 01.05.2005(-- stop arg)
# */
#
# $ cp urlgrep.awk /usr/local/bin/urlgrep
# $ urlgrep regex ! regex files
# $ urlgrep '' file
# $ urlgrep . file
# $ curl -s www.blah.ch | urlgrep
# $ wget -qO- www.microsoft.com | urlgrep [| xargs curl -sD-]
# $ curl -s www.apple.com | urlgrep ! 200[25] +[^m]/$
# $ curl -s http://e2e.serveftp.net/wee/ | urlgrep wmv$ | xargs -n1 mplayer
# $ ## MMMM! example site aightgenossen.ch, play media files with max 3 mplayers@1p
# $ wget xmb.ath.cx/threads.sh
# $ . threads.sh ; threads_max=3
# $ getwhile() { [[ ! $1 ]] && echo $last && return 1; last=$( curl -s $@ |
# > urlgrep ) match=$( grep -E '(wm[av]|rm|mov|avi)$|^rtsp' ) && getwhile }
#
# v2 signal todo
# curl -s aightgenossen.ch/index.php/m=multimedia
# .. unfinished
#
# Globals set in BEGIN and read by the rules/functions below:
#   dup      non-zero: print each distinct URL only once (default 1)
#   grep     regex a URL must match to be printed (empty = no filter)
#   nogrep   regex a URL must NOT match to be printed (empty = no filter)
#   quote    non-zero: wrap printed URLs in single quotes (set with -v quote=1)
#   DEBUG    non-zero: print the configuration to stderr; > 1 also traces lines
# Exit status is 3 when no URL was printed.

BEGIN {
    # Remember whether -v dup=... was given, so the command line keeps
    # precedence over the URLGREP_NODUP environment variable below.
    dup_given = (dup != "")
    if (! dup && dup != "0") dup = 1    # filter duplicates, by default on
    if (! dup_domain && dup_domain != "0") dup_domain = 0    # filter duplicates
        # ^ by domain but show whole url matching first
    if (! show_dom_only && show_dom_only != "0") show_dom_only = 0    # show only
        # ^ the domain/host of the url

    # Map environment variables, accessible with the URLGREP_ prefix,
    # e.g. NODUP is the environment variable URLGREP_NODUP.
    # Each entry is a triple: ENV-suffix  CONF-key  value.
    # A value of "." means "take the value of the environment variable";
    # anything else is a fixed value stored when the variable is set.
    narg = split( \
        "NODUP        dup          0 " \
        "GREP         grep         . " \
        "NOGREP       nogrep       . " \
        "URL          url          . " \
        "DEBUG        DEBUG        . " \
        "DUP_DOMAIN   dup_domain   . " \
        "DOMAIN_ONLY  domain_only  . ", Args)

    # Walk the table one triple at a time.  A dedicated index is used so
    # that `i`, the ARGV cursor below, is not disturbed.
    for (k = 1; k + 2 <= narg; k += 3) {
        name = "URLGREP_" Args[k]
        # `in` avoids creating empty ENVIRON entries as a side effect.
        if (! (name in ENVIRON) || ! ENVIRON[name]) continue
        if (Args[k + 2] == ".") CONF[ Args[k + 1] ] = ENVIRON[name]
        else CONF[ Args[k + 1] ] = Args[k + 2]
    }

    # Argument parsing: leading arguments are regexes, not files.
    #   "!regex" or "! regex"  -> negative regex (nogrep)
    #   "+regex", "+ regex" or a bare first argument -> positive regex (grep)
    # Consumed arguments are deleted from ARGV so awk does not open them
    # as input files.  Command-line regexes override the environment.
    i = 0
    while (ARGV[++i]) {
        if (ARGV[i] ~ /^!/) CONF["nogrep"] = get_arg(i)
        else CONF["grep"] = get_arg(i)
        if (skip) {
            # get_arg() took the regex from the following argument as well
            skip = 0
            delete ARGV[i]
            delete ARGV[++i]
        } else
            delete ARGV[i]
        if (ARGV[i + 1] !~ /^[+!-]/) break    # stop after having enough regexes
    }

    if (CONF["grep"]) grep = CONF["grep"]
    if (CONF["nogrep"]) nogrep = CONF["nogrep"]
    if (CONF["url"]) url = CONF["url"]
    if (CONF["DEBUG"]) DEBUG = CONF["DEBUG"]
    if (("dup" in CONF) && ! dup_given) dup = CONF["dup"] + 0
    if (CONF["dup_domain"]) dup_domain = CONF["dup_domain"]
    # NOTE(review): CONF["domain_only"] is collected but nothing in this
    # script reads it (nor dup_domain / show_dom_only) -- presumably v2 work.
    # NOTE(review): the old, commented-out mapping also had URLGREP_QUOT ->
    # quote; it is not in the table above, so `quote` is only settable via -v.

    err = "/dev/stderr"
    r_h = "(https?|ftp|rtsp|mms)://"    # head(er): the URL schemes recognised
    #r_h = "[a-zA-Z]+://"
    #r_m = r_h " ?['\"]?[^ \t'\"<>]+"
    # Full match: scheme, optional blanks and one optional quote, then at
    # least one "." inside a run free of blanks, quotes and angle brackets.
    r_m = r_h " *['\"]?[^ \t'\"<>]+\\.[^ \t'\"<>]+"

    if (DEBUG)
        printf("url = %s, match '%s', dont match '%s'\n",
            (url) ? url : "none", grep, nogrep) > err
}

# Only lines containing a scheme header are scanned; each URL found is
# handed to _p() and then removed from $0 so the next one can be matched.
$0 ~ r_h {
    # Trace goes to stderr so it cannot end up in the URL stream on stdout.
    if (DEBUG > 1) print "LINE:", $0 > err
    #while ($0 ~ r_h "[^\"'\t<][ \t]*[^ \t]+\\.") { # original way uses r_h
    while (match($0, r_m)) {
        s = substr($0, RSTART, RLENGTH)
        total++
        if (! dup) _p(s)
        else if (! Seen[s]++) _p(s)    # first sighting of this URL
        else found_dup++
        sub(r_m, "")    # drop the URL just handled (leftmost match)
    }
}

# Print a one-line summary to stderr; exit with status 3 when nothing
# was printed (the summary is skipped in that case).
END {
    fflush()
    if (! count) exit 3
    printf "urlgrep: %d total", total > err
    if (found_dup)
        printf ", %d printed, %d duplicates", count, found_dup > err
    if (found_dup + count < total)
        printf ", %d not matched", total - (found_dup + count) > err
    printf "\n" > err
    fflush()
}

# get_arg(p) - return the regex carried by ARGV[p].
#   ARGV[p] without a leading + ! or -  : returned as is
#   flag with text attached ("!foo")    : the text after the flag character,
#                                         with leading spaces removed
#   lone flag character ("!")           : ARGV[p + 1]; the global `skip` is
#                                         set to 1 so the caller also
#                                         consumes that following argument
# tmp is a local scratch variable.
function get_arg(p,    tmp) {
    if (ARGV[p] !~ /^[+!-]/)
        return ARGV[p]
    if (length(ARGV[p]) > 1) {
        tmp = substr(ARGV[p], 2)
        sub(/^ */, "", tmp)
        return tmp
    }
    skip = 1
    return ARGV[p + 1]
}

# _p(s) - main match/nomatch check & print function.
# Returns 1 without printing when s fails the `grep` filter or hits the
# `nogrep` filter.  Otherwise strips blanks, tabs and quotes from s, prints
# it (single-quoted when `quote` is set) and increments the global `count`.
function _p(s) {
    if ((grep && s !~ grep) || (nogrep && s ~ nogrep)) return 1
    gsub(/[ \t'"]+/, "", s)
    if (quote) print "'" s "'"    # small slowdown
    else print s
    count++
}