#!/usr/bin/env ruby
# frozen_string_literal: true
#
# Author: Martin Matusiak
# Licensed under the GNU Public License, version 3.
#
# Reads text from standard input and prints every URL (default) or email
# address found in it, one per line. With --emailcsv the addresses are
# collected, sorted and printed as a two-column CSV instead.
#
# revision 3 - allow spaces in urls
# revision 2 - introduce buffering to handle large files out of memory
# revision 1 - performance hacking: output entries immediately, only sort on
#              emailcsv

require "optparse"

# Pattern used by --email and --emailcsv.
EMAIL_PATTERN = /([a-zA-Z0-9_\.-])+@(([a-zA-Z0-9-])+\.)+([a-zA-Z0-9]{2,4})+/m

# Earlier URL pattern. Not used by the script; kept for reference only.
URL_ORIG_PATTERN = /([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9\/](([A-Za-z0-9$_.+!*,;\/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;\/?:@&~=%-]{0,1000}))?)/m

# Pattern used by default and by --url.
URL_PATTERN = /([A-Za-z][A-Za-z0-9+.-]{1,120}:\/\/(([A-Za-z0-9$_.+!*,;\/?:@&~(){}\[\]=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9 $_.+!*,;\/?:@&~(){}\[\]=%-]{0,1000}))?)/m

# Number of bytes requested from the input per read.
BUFFER_SIZE = 10 * 1024

# Maximum number of unmatched trailing characters carried over from one read
# to the next, so that a match straddling two reads can still be found.
HARDLIMIT = 100

# Parses the command line switches.
#
# @param argv [Array<String>] arguments; recognised switches are removed
# @return [Hash] with keys :pattern (Regexp), :joinlines and :emailcsv
# @raise [OptionParser::ParseError] on an unrecognised switch
def parse_options(argv)
  options = { pattern: URL_PATTERN, joinlines: false, emailcsv: false }

  OptionParser.new do |opts|
    opts.on("--url", "url format") do
      options[:pattern] = URL_PATTERN
    end
    opts.on("--dat", "firefox history.dat format = \\\\n in urls") do
      options[:joinlines] = true
    end
    opts.on("--email", "email format") do
      options[:pattern] = EMAIL_PATTERN
    end
    opts.on("--emailcsv", "csv output (facebook contact import)") do
      options[:pattern] = EMAIL_PATTERN
      options[:emailcsv] = true
    end
  end.parse!(argv)

  options
end

# Reads +io+ in chunks and yields every substring matching +pattern+, in
# input order. A block is required.
#
# @param io [#read] source; must return nil (or "") from read(n) at EOF
# @param pattern [Regexp] pattern to search for
# @param joinlines [Boolean] remove backslash-newline pairs before matching
# @param buffer_size [Integer] bytes requested per read
# @param hardlimit [Integer] max unmatched tail kept between reads
# @yieldparam match [String] the matched text
# @return [void]
def each_match(io, pattern, joinlines: false, buffer_size: BUFFER_SIZE, hardlimit: HARDLIMIT)
  carry = ""    # text kept from the previous read and rescanned with the next
  pending = ""  # match touching the end of the current text, not yet emitted

  while (chunk = io.read(buffer_size)) && !chunk.empty?
    text = carry + chunk
    held = ""

    if joinlines
      text.gsub!(/\\\n/, "")
      # A trailing backslash may be the first half of a backslash-newline
      # pair whose newline arrives with the next read. Set it aside so the
      # text before it is treated as possibly incomplete instead of being
      # emitted truncated.
      if text.end_with?("\\")
        text = text[0...-1]
        held = "\\"
      end
    end

    pending = ""
    pos = 0
    while (m = pattern.match(text, pos))
      if m.end(0) == text.length
        # The match runs up to the end of what has been read so far; more
        # input could extend it, so hold it back.
        pending = m.to_s
      else
        yield m.to_s
      end
      pos = m.end(0)
    end

    tail = text[pos..-1]
    tail = tail[(tail.length - hardlimit)..-1] if tail.length > hardlimit
    carry = (pending.empty? ? tail : pending) + held
  end

  # Nothing more can extend a held-back match once the input is exhausted.
  yield pending unless pending.empty?
end

# Collects everything each_match yields into an array.
#
# @param io [#read] source, see each_match
# @param pattern [Regexp] pattern to search for
# @param options [Hash] keyword options forwarded to each_match
# @return [Array<String>] matches in input order
def extract_matches(io, pattern, **options)
  found = []
  each_match(io, pattern, **options) { |entry| found << entry }
  found
end

# Writes the addresses as CSV with a header row and an empty second column,
# sorted case-insensitively with exact duplicates removed. Writes nothing
# when +entries+ is empty.
#
# @param entries [Array<String>] email addresses
# @param output [#puts] destination
# @return [void]
def write_email_csv(entries, output)
  return if entries.empty?

  output.puts '"Email Address","Formatted Name"'
  # The second sort key only fixes the order of addresses that differ in
  # case alone.
  sorted = entries.sort_by { |address| [address.downcase, address] }.uniq
  sorted.each { |address| output.puts '"' + address + '",""' }
end

# Script entry point: parse switches, scan the input, print the results.
#
# @param argv [Array<String>] command line arguments
# @param input [#read] text source
# @param output [#puts] destination
# @return [void]
def main(argv = ARGV, input = STDIN, output = $stdout)
  options = parse_options(argv)
  scan_options = { joinlines: options[:joinlines] }

  if options[:emailcsv]
    write_email_csv(extract_matches(input, options[:pattern], **scan_options), output)
  else
    each_match(input, options[:pattern], **scan_options) { |entry| output.puts entry }
  end
end

main if __FILE__ == $PROGRAM_NAME