4 # urlgrep[.awk] v1 - print http|ftp|rtsp|mms:// URL entries found in text
5 # urlgrep [-v dup=[01]] [[+] regex] [! regex] [file|-] [files..]
6 # no files specified = read from stdin
7 # w/o files, read stdin
9 # todo: doing hpfetch with v2 of this
10 # todo: v2: multiple positive/negative regex support
11 # todo: v2: recognition: http://blah.com/path) text
13 # by xmb<xmb@skilled.ch> - localhack
15 # 24.12.2004, 26.12.2004, 02.01.2005, 26.01.2005(rtsp support)
16 # 28.01.2005(grep like syntax and ! <notmatch>)
17 # 09.02.2005(fixed args, others, renamed from httpgrep to urlgrep)
18 # 12.02.2005(mms support), 28.02.2005(robustness update)
19 # 07.03.2005(conf enhancements)
20 # 24.03.2005(regex/gsub enh's, more cmd examples)
21 # 01.05.2005(-- stop arg)
24 # $ cp urlgrep.awk /usr/local/bin/urlgrep
26 # $ urlgrep regex ! regex files
29 # $ curl -s www.blah.ch | urlgrep
30 # $ wget -qO- www.microsoft.com | urlgrep [| xargs curl -sD-]
31 # $ curl -s www.apple.com | urlgrep ! 200[25] +[^m]/$
33 # $ curl -s http://e2e.serveftp.net/wee/ | urlgrep wmv$ | xargs -n1 mplayer
36 ## MMMM! example site aightgenossen.ch, play media files with max 3 mplayers@1p
37 # $ wget xmb.ath.cx/threads.sh
38 # $ . threads.sh ; threads_max=3
39 # $ getwhile() { [[ ! $1 ]] && echo $last && return 1; last=$( curl -s $@ |
40 # > urlgrep ) match=$( grep -E '(wm[av]|rm|mov|avi)$|^rtsp' ) && getwhile }
42 # curl -s aightgenossen.ch/index.php/m=multimedia
46 if (! dup && dup != "0") dup = 1 # duplicate filtering: on unless dup was explicitly given as 0 (e.g. -v dup=0)
47 if (! dup_domain && dup_domain != "0") dup_domain = 0 # per-domain duplicate filtering: off by default;
48 # ^ when on, presumably only the first URL seen for each domain is shown, in full - confirm in the main loop
49 if (! show_dom_only && show_dom_only != "0") show_dom_only = 0 # off by default; when on, show only
50 # ^ the domain/host part of each URL
52 # map env variables, accessible with URLGREP_ prefix
53 # eg, NODUP is the env variable URLGREP_NODUP
60 DUP_DOMAIN dup_domain . \
61 DOMAIN_ONLY domain_only .\
64 for (arg = 1; arg <= narg; arg++) # NOTE(review): the counter is arg but the body indexes Args[] with i - confirm which is intended
65 if (ENVIRON[ Args[i] ]) { # act only when the named environment variable holds a true (non-empty, non-zero) value
66 if (Args[i+2] == ".") CONF[ Args[i+1] ] = ENVIRON[ Args[i] ] # a dot in the third field: copy the variable's own value
67 else CONF[ Args[i+1] ] = Args[i+2] # otherwise store the fixed value given in the third field
72 # argument parsing, this uses quite some CPU, gah, fixed
74 #if (i == 1 || ARGV[i] ~ /^\+/) grep = get_arg(i)
75 #if (ARGV[i] ~ /^!/) nogrep = get_arg(i)
77 if (ARGV[i] ~ /^!/) CONF["nogrep"] = get_arg(i) # a leading ! marks the exclude regex
78 else CONF["grep"] = get_arg(i) # any other argument reaching this point is the include regex
80 if (skip) { skip = 0; delete ARGV[i]; delete ARGV[++i] } # drop this and the next ARGV slot so awk does not read them as files; skip is set outside the visible lines (presumably by get_arg)
83 if (ARGV[i + 1] !~ /^[+!-]/) break # stop after having enough regexes
86 if (CONF["grep"]) grep = CONF["grep"] # include regex collected from arguments/environment
87 if (CONF["nogrep"]) nogrep = CONF["nogrep"] # exclude regex: must read the nogrep key filled in by the ! argument
88 if (CONF["url"]) url = CONF["url"] # copy the url setting only when it is set
89 if (CONF["DEBUG"]) DEBUG = CONF["DEBUG"] # debug level, compared numerically further down
91 #if (ENVIRON["URLGREP_NODUP"]) dup = 0
92 #if (ENVIRON["URLGREP_GREP"]) grep = ENVIRON["URLGREP_GREP"]
93 #if (ENVIRON["URLGREP_NOGREP"]) nogrep = ENVIRON["URLGREP_NOGREP"]
94 #if (ENVIRON["URLGREP_URL"]) url = ENVIRON["URLGREP_URL"]
95 #if (ENVIRON["URLGREP_DEBUG"]) DEBUG = ENVIRON["URLGREP_DEBUG"]
96 #if (ENVIRON["URLGREP_QUOT"]) quote = ENVIRON["URLGREP_QUOT"]
97 #if (ENVIRON["URLGREP_DUP_DOMAIN"]) dup_domain = ENVIRON["URLGREP_DUP_DOMAIN"]
100 r_h = "(https?|ftp|rtsp|mms)://" # head(er): the scheme prefix that starts every recognised URL
101 #r_h = "[a-zA-Z]+://"
102 #r_m = r_h " ?['\"]?[^ \t'\"<>]+"
103 r_m = r_h " *['\"]?[^ \t'\"<>]+\\.[^ \t'\"<>]+" # match: head, optional spaces and one optional quote, then a run free of blanks, quotes and angle brackets that contains a dot
105 if (DEBUG) printf "url = %s, match '%s', dont match '%s'\n",
106 (url) ? url : "none", grep, nogrep >err # debug only: report the effective url, include regex and exclude regex on err
109 #DEBUG > 2 # small slowdown
112 if (DEBUG > 1) print "LINE:", $0 # echo every input line at debug level 2 and above
113 #while ($0 ~ r_h "[^\"'\t<][ \t]*[^ \t]+\\.") { # original way uses r_h
115 s = substr($0, match($0, r_m), RLENGTH) # first r_m match in the line; NOTE(review): relies on match() running before RLENGTH is read as the third argument - confirm for the target awk
117 if (dup && ! Seen[s]++) # true only the first time s is seen while duplicate filtering is on
130 printf "urlgrep: %d total", total >err # summary line, written to err piece by piece
132 printf ", %d printed, %d duplicates", count, found_dup >err
133 if (found_dup + count < total) # some URLs were neither printed nor counted as duplicates
134 printf ", %d not matched", total - (found_dup + count) >err
139 # get either current or next arg
140 function get_arg(p ,tmp) { # p: index into ARGV; tmp: local scratch variable
141 if (ARGV[p] ~ /^[+!-]/) { # the argument starts with a marker character (+, ! or -)
142 if (length(ARGV[p]) > 1) { # something follows the marker inside the same argument
143 tmp = substr(ARGV[p], 2) # keep everything after the marker
154 # main match/nomatch check & print function
156 if ((grep && s !~ grep) || nogrep && s ~ nogrep) return 1 # reject: s fails the include regex or hits the exclude regex
158 gsub(/[ \t'"]*/, "", s) # strip blanks, tabs and both quote characters from s; NOTE(review): an older note said :;, were newly added, but the class does not contain them
159 if (quote) print "'" s "'" # wrap the output in single quotes when quote is set; small slowdown