Meandering Your Google Neighborhood

advanced tipscreenshot tip65.gif

Google Neighborhood attempts to detangle the Web by building a "neighborhood" of sites around a URL.
link

It's called the World Wide Web, not the World Wide Straight Line. Sites link to other sites, building a "web" of sites. And what a tangled web we weave.

Google Neighborhood attempts to detangle some small portion of the Web by using the Google API to find sites related to a URL you provide, scraping the links on the sites returned, and building a "neighborhood" of sites that link to both the original URL and each other.

If you'd like to give this tip a whirl without having to run it yourself, there's a live version available
at http://diveintomark.org/archives/2002/06/04.html#who_are_the_people_in_your_neighborhood. The source code (included below) for Google Neighborhood is available for download from http://diveintomark.org/projects/misc/neighbor.py.txt.

The Code

Google Neighborhood is written in the Python (http://www.python.org) coding language. Your system will need to have Python installed for you to run this tip.

"""
neighbor.cgi Blogroll finder and aggregator
"""
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__copyright__ = "Copyright 2002, Mark Pilgrim"
__license__ = "Python"

# timeoutsocket is an optional third-party module.  When it is available,
# install a default socket timeout of 10 so a single unresponsive site
# cannot stall the crawl; when it is missing, carry on without a timeout.
try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass

import urllib
import urlparse
import os
import time
import operator
import sys
import pickle
import re
import cgi
from sgmllib import SGMLParser
from threading import *
# Not referenced by any code visible in this file.
BUFFERSIZE = 1024

# BlogrollParser._goodlink consults the two extension tuples in order:
# an extension listed in INCLUDEEXTS is accepted, otherwise one listed in
# IGNOREEXTS is rejected, and anything else is accepted.
IGNOREEXTS = (
    '.xml',
    '.opml',
    '.rss',
    '.rdf',
    '.pdf',
    '.doc',
)
INCLUDEEXTS = (
    '',
    '.html',
    '.htm',
    '.shtml',
    '.php',
    '.asp',
    '.jsp',
)

# Links whose domain is exactly one of these are never counted.
IGNOREDOMAINS = (
    'cgi.alexa.com',
    'adserver1.backbeatmedia.com',
    'ask.slashdot.org',
    'freshmeat.net',
    'readroom.ipl.org',
    'amazon.com',
    'ringsurf.com',
)
def prettyURL(url):
    """Return a short display form of *url*: domain plus path only.

    The scheme, params, query string and fragment are dropped, and a bare
    '/' path is removed so that 'http://example.com/' becomes
    'example.com'.
    """
    protocol, domain, path, params, query, fragment = urlparse.urlparse(url)
    if path == '':
        pass
    elif path == '/':
        path = ''
    pretty = urlparse.urlunparse(('', domain, path, '', '', ''))
    # urlunparse emits a leading '//' in front of the domain.  Strip only
    # that prefix; a blanket replace('//', '') would also eat doubled
    # slashes that legitimately occur inside the path.
    if pretty[:2] == '//':
        pretty = pretty[2:]
    return pretty
def simplifyURL(url):
    """Normalize *url* so that trivially different spellings compare equal.

    Removes every 'www.' substring, rewrites '/coming.html' to '/', and
    appends a trailing '/' when the URL has no path component.
    """
    url = url.replace('www.', '')
    url = url.replace('/coming.html', '/')
    protocol, domain, path, params, query, fragment = urlparse.urlparse(url)
    if path == '':
        url = url + '/'
    return url


class MinimalURLOpener(urllib.FancyURLopener):
    """FancyURLopener that sends an empty User-agent and ignores HTTP 401."""

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.addheaders = [('User-agent', '')]

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        # Deliberate no-op override of the base class's 401 handling
        # (presumably to avoid its authentication retry -- confirm against
        # the urllib documentation).
        pass


class BlogrollParser(SGMLParser):
    """Collect the outbound links of a page that look like a blogroll.

    Acceptable links are accumulated in self.possible as they are seen.
    Whenever non-blank text appears outside an anchor the run is ended:
    a run of four or more links is copied into self.blogroll, a shorter
    run is discarded.
    """

    def __init__(self, url):
        SGMLParser.__init__(self)
        self.url = url
        self.reset()

    def reset(self):
        SGMLParser.reset(self)
        self.possible = []   # links in the current uninterrupted run
        self.blogroll = []   # confirmed links, duplicates removed
        self.ina = 0         # 1 while between <a> and </a>

    def _goodlink(self, href):
        """Return 1 if *href* is worth counting as a neighbor, else 0."""
        protocol, domain, path, params, query, fragment = urlparse.urlparse(href)
        if protocol.lower() != 'http': return 0
        # Skip links whose domain occurs in the URL of the page being parsed.
        if self.url.find(domain) != -1: return 0
        if domain in IGNOREDOMAINS: return 0
        if domain.find(':5335') != -1: return 0
        if domain.find('.google') != -1: return 0
        if fragment: return 0
        shortpath, ext = os.path.splitext(path)
        ext = ext.lower()
        if ext in INCLUDEEXTS: return 1
        if ext in IGNOREEXTS: return 0
        # more rules here?
        return 1

    def _confirmpossibles(self):
        """End the current run of links, keeping it if it is long enough."""
        if len(self.possible) >= 4:
            for url in self.possible:
                if url not in self.blogroll:
                    self.blogroll.append(url)
        # NOTE(review): the source's indentation was lost; resetting after
        # every call (not only after a successful run) is the reading under
        # which the four-link threshold applies to consecutive links.
        self.possible = []

    def start_a(self, attrs):
        self.ina = 1
        hreflist = [e[1] for e in attrs if e[0]=='href']
        if not hreflist: return
        href = simplifyURL(hreflist[0])
        if self._goodlink(href):
            self.possible.append(href)

    def end_a(self):
        self.ina = 0

    def handle_data(self, data):
        # Link text itself never ends a run; only text between links does.
        if self.ina: return
        if data.strip():
            self._confirmpossibles()

    def end_html(self, attrs=None):
        # sgmllib invokes end_<tag> handlers with no arguments, so attrs
        # must be optional; it is kept only so existing callers that pass
        # it keep working.
        self._confirmpossibles()
def getRadioBlogroll(url):
    """Return the htmlUrl values from <url>/gems/mySubscriptions.opml.

    Returns [] when the file cannot be fetched or does not contain '<opml'.
    """
    usock = None
    try:
        try:
            usock = MinimalURLOpener().open('%s/gems/mySubscriptions.opml' % url)
            opmlSource = usock.read()
        finally:
            # Close the connection even when read() fails part-way.
            if usock is not None:
                usock.close()
    except Exception:
        # Best effort: any fetch failure simply means "no OPML blogroll".
        return []
    if opmlSource.find('<opml') == -1: return []
    radioBlogroll = []
    marker = 'htmlUrl="'
    start = 0
    while 1:
        p = opmlSource.find(marker, start)
        if p == -1: break
        valueStart = p + len(marker)
        # Search for the closing quote directly instead of slicing a
        # fixed 100-character window, which truncated long URLs.
        valueEnd = opmlSource.find('"', valueStart)
        if valueEnd == -1: break
        radioBlogroll.append(opmlSource[valueStart:valueEnd])
        start = valueEnd + 1
    return radioBlogroll


def getBlogroll(url):
    """Return the list of sites that *url* links to.

    'http://' is prepended when missing.  The OPML subscription list is
    preferred when getRadioBlogroll finds one; otherwise the page itself
    is fetched and scanned with BlogrollParser.  Returns [] if the page
    cannot be fetched.
    """
    if url[:7] != 'http://':
        url = 'http://' + url
    radioBlogroll = getRadioBlogroll(url)
    if radioBlogroll:
        return radioBlogroll
    parser = BlogrollParser(url)
    usock = None
    try:
        try:
            usock = MinimalURLOpener().open(url)
            htmlSource = usock.read()
        finally:
            if usock is not None:
                usock.close()
    except Exception:
        return []
    parser.feed(htmlSource)
    # Flush whatever sgmllib is still buffering so the tail of the page
    # is parsed as well.
    parser.close()
    return parser.blogroll


class BlogrollThread(Thread):
    """Worker thread that fetches one blogroll and reports it to its master."""

    def __init__(self, master, url):
        Thread.__init__(self)
        self.master = master
        self.url = url

    def run(self):
        self.master.callback(self.url, getBlogroll(self.url))
class BlogrollThreadMaster:
    """Fetch many blogrolls, five threads at a time.

    After construction, self.blogrollDict maps each fetched URL to the
    list of URLs found on it.

    url     -- either a single URL string (its blogroll is fetched first
               and used as the work list) or a ready-made list of URLs.
    recurse -- number of levels to crawl; values above 1 make the master
               also fetch the blogrolls of the URLs it discovered.
    """

    def __init__(self, url, recurse):
        self.blogrollDict = {}
        self.done = 0
        if isinstance(url, type('')):
            blogroll = getBlogroll(url)
        else:
            blogroll = url
        self.run(blogroll, recurse)

    def callback(self, url, blogroll):
        # Results that arrive after the crawl was declared finished (from
        # threads that outlived their join timeout) are dropped.
        if not self.done:
            self.blogrollDict[url] = blogroll

    def run(self, blogroll, recurse):
        start = 0
        end = 5
        while 1:
            threads = []
            for url in blogroll[start:end]:
                if not self.blogrollDict.has_key(url):
                    t = BlogrollThread(self, url)
                    threads.append(t)
            for t in threads:
                t.start()
                time.sleep(0.000001)
            for t in threads:
                time.sleep(0.000001)
                # Wait at most 10 seconds per thread, then move on.
                t.join(10)
            start += 5
            end += 5
            # >= rather than >: avoids one pointless empty pass when the
            # list length is an exact multiple of five.
            if start >= len(blogroll): break
        if recurse > 1:
            # The [] initial value keeps reduce from raising TypeError
            # when nothing at all was fetched.
            masterlist = reduce(operator.add, self.blogrollDict.values(), [])
            newlist = [url for url in masterlist if not self.blogrollDict.has_key(url)]
            self.run(newlist, recurse - 1)
        else:
            self.done = 1
def sortBlogrollData(blogrollDict):
    """Count how often each URL occurs across all blogrolls.

    blogrollDict maps a site to the list of URLs found on it.  Returns a
    list of (count, url) tuples in descending order (highest count first;
    ties in descending URL order).
    """
    sortD = {}
    for blogroll in blogrollDict.values():
        for url in blogroll:
            sortD[url] = sortD.get(url, 0) + 1
    sortI = [(v, k) for k, v in sortD.items()]
    sortI.sort()
    sortI.reverse()
    return sortI


def trimdata(sortI, cutoff):
    """Keep only the (count, url) pairs whose count is at least *cutoff*."""
    return [(c, url) for c, url in sortI if c >= cutoff]
def getRelated(url):
    """Return the URLs of up to 30 'related:' search results for *url*.

    Results are requested ten at a time, at most three times; paging
    stops early when a request returns fewer than ten results.
    """
    # Imported here rather than at module level, so the google module is
    # only needed when a search is actually made.
    import google
    results = []
    start = 0
    for i in range(3):
        data = google.doGoogleSearch('related:%s' % url, start)
        results.extend([oneResult.URL for oneResult in data.results])
        start += 10
        if len(data.results) < 10: break
    return results


def getNeighborhood(baseURL):
    """Build the neighborhood of *baseURL*.

    Returns a list of (count, url, prettyURL(url)) tuples for every URL
    that occurs at least twice in the blogrolls of the sites related to
    baseURL, highest count first.
    """
    relatedList = getRelated(baseURL)
    blogrollDict = BlogrollThreadMaster(relatedList, 1).blogrollDict
    neighborhood = sortBlogrollData(blogrollDict)
    neighborhood = trimdata(neighborhood, 2)
    neighborhood = [(c,url, prettyURL(url)) for c,url in neighborhood]
    return neighborhood


def render_html(baseURL, data):
    """Render *data* (as returned by getNeighborhood) as an HTML table.

    Returns the markup as one string.  URLs and titles come from scraped
    pages, so they are escaped before being written into the markup.
    """
    heading = cgi.escape(prettyURL(baseURL), 1)
    output = []
    output.append("""
<table summary="neighborhood for %s">
<caption>Neighborhood for %s</caption>
<thead>
<tr>
<th scope="col">Name</th>
<th scope="col">Links</th>
<th scope="col">Explore</th>
</tr>
</thead>
<tbody>""" % (heading, heading))
    for c, url, title in data:
        # The neighbor URL travels as a query-string value, so it has to
        # be URL-quoted in full (safe='') before it goes into the href.
        exploreURL = ('http://diveintomark.org/cgi-bin/neighborhood.cgi?url=%s'
                      % urllib.quote(url, ''))
        output.append("""<tr><td>
<a href="%s">%s</a></td><td>%s</td><td><a href="%s">explore</a></td></tr>""" % (
            cgi.escape(url, 1), cgi.escape(title), c,
            cgi.escape(exploreURL, 1)))
    output.append("""
</tbody>
</table>""")
    return "".join(output)
def render_rss(baseURL, data):
    """Render *data* (as returned by getNeighborhood) as an RSS 1.0 feed.

    Returns the XML document as one string.  Values taken from scraped
    pages are XML-escaped before being written into the document.
    """
    title = prettyURL(baseURL)
    fields = {
        'baseURL': cgi.escape(baseURL, 1),
        'title': cgi.escape(title),
        'channeltitle': cgi.escape("%s neighborhood" % title),
        # UTC with an explicit 'Z' designator; a fixed '-05:00' suffix on
        # local time is only correct on a server that sits in UTC-5.
        'localtime': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
    }
    output = []
    output.append("""<?xml version="1.0"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:admin="http://webns.net/mvcb/">
<channel rdf:about="%(baseURL)s">
<title>%(channeltitle)s</title>
<link>%(baseURL)s</link>
<description>Sites in the virtual neighborhood of %(title)s</description>
<language>en-us</language>
<lastBuildDate>%(localtime)s</lastBuildDate>
<pubDate>%(localtime)s</pubDate>
<admin:generatorAgent rdf:resource="http://diveintomark.org/cgi-bin/neighborhood.cgi/?v=1.1" />
<admin:errorReportsTo rdf:resource="mailto:f8dy@diveintomark.org"/>
<sy:updatePeriod>weekly</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
<items>
<rdf:Seq>
""" % fields)
    # First pass: the channel's table of contents.
    for c, url, title in data:
        output.append("""<rdf:li rdf:resource="%s" />
""" % cgi.escape(url, 1))
    output.append("""</rdf:Seq>
</items>
</channel>
""")
    # Second pass: one <item> per neighbor.
    for c, url, title in data:
        output.append("""<item rdf:about="%(url)s">
<title>%(title)s</title>
<link>%(url)s</link>
<description>%(c)s links</description>
</item>
""" % {'url': cgi.escape(url, 1), 'title': cgi.escape(title), 'c': c})
    output.append("""</rdf:RDF>""")
    return "".join(output)
if __name__ == '__main__':
    # Command-line entry point: print the HTML neighborhood of the URL
    # given as the first argument.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s URL\n' % sys.argv[0])
        sys.exit(1)
    baseURL = sys.argv[1]
    # render_html takes the base URL as well as the neighborhood data.
    print(render_html(baseURL, getNeighborhood(baseURL)))

Running the Tip

Google Neighborhood runs as a CGI script in your browser. Provide it the URL you're interested in using as the center, select HTML or RSS output (see also [Tip #82]), and hit the "Meander" button.

You'll need an HTML form to call Google Neighborhood. Here's a simple one:

<form action="/cgi-bin/neighborhood.cgi" method="get">
URL: <input name="url" type="text" />
<br />
Output as: <input name="fl" type="radio" value="html" checked="true" /> HTML
<input name="fl" type="radio" value="rss" /> RSS
<br />
<input type="submit" value="Meander" />
</form>

Of course, you should alter the action= to point at the location in which you installed the CGI script.

Figure 6-8 shows a representation of Rael's (raelity.org's, to be precise) Google Neighborhood. Clicking on any of the links on the left transports you to the URL shown. More interestingly, the "explore" link shifts your point-of-view, centering the neighborhood on the associated URL. You can thus meander a neighborhood to your heart's content; don't be surprised, especially in the blogging world, if you keep coming across the same links. Speaking of links, the number listed beneath the "Links" heading represents the number of links the associated site has to the currently focused site.

Figure 6-8. raelity.org's Google Neighborhood
screenshot google-tips-0608.gif

Tweaking the Tip

If you want to tweak this tip, you can concentrate your efforts on a small block of code specifying what file extensions you want to include and exclude, as well as what domains you want to exclude when calculating your neighborhoods:

# File extensions to reject / accept, and domains to leave out of the
# neighborhood.
IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc')
INCLUDEEXTS = ('', '.html', '.htm', '.shtml', '.php', '.asp', '.jsp')
IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com',
                 'ask.slashdot.org', 'freshmeat.net', 'readroom.ipl.org',
                 'amazon.com', 'ringsurf.com')
Noticing/ignoring file extensions

The way the tip is currently written, the neighborhood is built around pretty standard files. However, you could create a neighborhood of sites served by PHP (http://www.php.net/), including only URLs with a PHP (.php) extension. Or perhaps your interest lies in Word documents and PDF files. You'd alter the code as follows:

# Reject the usual web-page extensions ...
IGNOREEXTS = (
    '.xml',
    '.opml',
    '.rss',
    '.rdf',
    '.html',
    '.htm',
    '.shtml',
    '.php',
    '.asp',
    '.jsp',
)
# ... and accept extension-less URLs, PDF files and Word documents.
INCLUDEEXTS = (
    '',
    '.pdf',
    '.doc',
)
Ignoring domains

Sometimes when you're building a neighborhood you might notice that the same links are popping up again and again. They're not really part of the neighborhood but tend to be places that the web pages making up your neighborhood often link to. For example, most Blogger-based weblogs include a link to Blogger.com as a matter of course.

Exclude domains that hold no interest to you by adding them to the IGNOREDOMAINS list:

# Domains left out of every neighborhood; 'blogger.com' is the addition.
IGNOREDOMAINS = (
    'cgi.alexa.com',
    'adserver1.backbeatmedia.com',
    'ask.slashdot.org',
    'freshmeat.net',
    'readroom.ipl.org',
    'amazon.com',
    'ringsurf.com',
    'blogger.com',
)

Google Neighborhood was written by Mark Pilgrim (http://diveintomark.org/).