From: Magnus Hagander
Date: Sun, 27 Nov 2011 17:57:30 +0000 (+0000)
Subject: Add a sitemap based site indexer, that can index the new website
X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=f8525539d10bad8cf8f9a6ac8048f31f570abfa7;p=pgweb-old.git

Add a sitemap based site indexer, that can index the new website

git-svn-id: file:///Users/dpage/pgweb/svn-repo/trunk@2912 8f5c7a92-453e-0410-a47f-ad33c8a6b003
---

diff --git a/portal/tools/search/classes/HttpRequest.class.php b/portal/tools/search/classes/HttpRequest.class.php
index 96f3f996..bb1dbe0d 100644
--- a/portal/tools/search/classes/HttpRequest.class.php
+++ b/portal/tools/search/classes/HttpRequest.class.php
@@ -105,7 +105,12 @@ class HttpRequest {
     }
 
     function getHeader($header) {
-        return $this->headers[strtolower($header)];
+        // Look the header up by its lower-cased name; NULL means "not present".
+        if (array_key_exists(strtolower($header), $this->headers)) {
+            return $this->headers[strtolower($header)];
+        } else {
+            return NULL;
+        }
     }
 }
 ?>
diff --git a/portal/tools/search/classes/SitemapSiteIndexer.class.php b/portal/tools/search/classes/SitemapSiteIndexer.class.php
new file mode 100644
index 00000000..e79041e9
--- /dev/null
+++ b/portal/tools/search/classes/SitemapSiteIndexer.class.php
@@ -0,0 +1,168 @@
+siteid = $siteid;
+        $this->indexer = new WebPageIndexer($siteid);
+
+        // The sitemap URL also supplies the host (and optional port) that
+        // the HttpRequest object is constructed with.
+        $this->baseurl = $baseurl;
+        $urlinfo = parse_url($baseurl);
+        $this->host = $urlinfo['host'];
+        if (isset($urlinfo['port'])) $this->port = $urlinfo['port'];
+        $this->http = new HttpRequest($this->host, $this->port);
+    }
+
+    /**
+     * Wrap a value in single quotes for use as a SQL string literal,
+     * doubling embedded single quotes so that a URL containing an
+     * apostrophe cannot end the literal early.
+     *
+     * NOTE(review): only quotes are handled here. If SearchDB offers
+     * parameterised statements or a connection-aware escape function,
+     * prefer that.
+     *
+     * @param string $value Raw value.
+     * @return string The literal, including its surrounding quotes.
+     */
+    private function quoteLiteral($value) {
+        return "'" . str_replace("'", "''", $value) . "'";
+    }
+
+    /**
+     * Reduce an absolute sitemap location to the site-relative URL (path
+     * plus optional query string) that is used both as the suburl key and
+     * as the request target. Replaces the former fixed 25-character skip,
+     * which only worked for locations starting with
+     * http://www.postgresql.org.
+     *
+     * @param string $loc Text of a loc element.
+     * @return string|null Site-relative URL ("/" when the location has no
+     *                     path), or NULL when it is empty or unparseable.
+     */
+    private function suburlFromLoc($loc) {
+        $loc = trim($loc);
+        $parts = ($loc === '') ? false : parse_url($loc);
+        if ($parts === false) {
+            return NULL;
+        }
+        $suburl = isset($parts['path']) ? $parts['path'] : '/';
+        if (isset($parts['query'])) {
+            $suburl .= '?' . $parts['query'];
+        }
+        return $suburl;
+    }
+
+    /**
+     * Index every page listed in the sitemap and drop pages that are gone.
+     *
+     * The sitemap is loaded from the URL given to the constructor; its root
+     * must be urlset and every child must be url, otherwise the run aborts
+     * with exit(1). Each location is requested through HttpRequest together
+     * with the last scan time stored in webpages_scantime (0 when there is
+     * none), and handled by response status:
+     *   304       - reported as unchanged, still recorded as present;
+     *   404       - its rows are deleted from webpages and
+     *               webpages_scantime and the entry is skipped;
+     *   200       - handed to the WebPageIndexer and, on success, counted;
+     *   otherwise - the run aborts with exit(1).
+     * Finally every suburl stored in webpages for this site that was not
+     * recorded as present is deleted from both tables and counted.
+     */
+    function IndexSitemapSite() {
+        $xml = simplexml_load_file($this->baseurl);
+        // simplexml_load_file() returns false (not an object) on failure.
+        if ($xml === false) {
+            print "Could not load sitemap " . $this->baseurl . "!\n";
+            exit(1);
+        }
+        if ($xml->getName() != "urlset") {
+            print "Root element is not urlset!\n";
+            exit(1);
+        }
+
+        // Fetch existing list
+        $lastdates = SearchDB::QueryAssociative("SELECT suburl,extract(epoch from lastscanned) FROM webpages_scantime WHERE site=" . $this->siteid);
+
+        foreach ($xml->children() as $child) {
+            if ($child->getName() != "url") {
+                print "Child element is not url!\n";
+                exit(1);
+            }
+
+            $urlpiece = $this->suburlFromLoc((string)$child->loc);
+            if ($urlpiece === NULL) {
+                print "Could not parse location " . $child->loc . ", skipping.\n";
+                continue;
+            }
+
+            $lastscan = array_key_exists($urlpiece, $lastdates) ? $lastdates[$urlpiece] : 0;
+            if (!$this->http->RequestURL($urlpiece, $lastscan)) {
+                print "Strange error for $urlpiece\n";
+                exit(1);
+            }
+
+            if ($this->http->status == 304) {
+                print "Page $urlpiece not changed.\n";
+            }
+            else if ($this->http->status == 404) {
+                print "Page $urlpiece in sitemap, but does not exist!\n";
+                $quoted = $this->quoteLiteral($urlpiece);
+                SearchDB::ExecuteStatement("DELETE FROM webpages_scantime WHERE site=" . $this->siteid . " AND suburl=" . $quoted);
+                SearchDB::ExecuteStatement("DELETE FROM webpages WHERE site=" . $this->siteid . " AND suburl=" . $quoted);
+                continue;
+            }
+            else if ($this->http->status != 200) {
+                print "Page $urlpiece returned invalid status " . $this->http->status . "!\n";
+                exit(1);
+            }
+            else {
+                // Fall back to "now" when the header is missing or unparseable.
+                $lastmod = false;
+                if ($this->http->getHeader("Last-Modified")) {
+                    $lastmod = strtotime($this->http->getHeader("Last-Modified"));
+                }
+                if ($lastmod === false) {
+                    $lastmod = time();
+                }
+
+                // Drop everything from the first "," and from the first ";".
+                $contenttype = (string)$this->http->getHeader("Content-type");
+                $contenttype = preg_replace('/,.*$/', '', $contenttype);
+                $contenttype = preg_replace('/;.*$/', '', $contenttype);
+                if (!$this->indexer->WantIndexFile($urlpiece,$lastmod,$contenttype)) {
+                    print "Don't want to index $urlpiece\n";
+                    continue;
+                }
+                if (!$this->indexer->IndexSinglePage($this->http->responsetext, 1, $urlpiece, $lastmod, $contenttype)) {
+                    print "Failed to index $urlpiece!\n";
+                    continue;
+                }
+                $this->count++;
+            } // http 200
+
+            // Add it to the list of indexed URLs, so we can figure out
+            // if it should be removed.
+            $this->found[$urlpiece] = 1;
+        }
+
+        foreach (SearchDB::QuerySingleColumn("SELECT suburl FROM webpages WHERE site=" . $this->siteid) as $suburl) {
+            // isset() avoids an undefined-index notice for pages not seen above.
+            if (isset($this->found[$suburl])) {
+                continue;
+            }
+            print "Page removed: $suburl\n";
+            $quoted = $this->quoteLiteral($suburl);
+            SearchDB::ExecuteStatement("DELETE FROM webpages WHERE site=" . $this->siteid . " AND suburl=" . $quoted);
+            SearchDB::ExecuteStatement("DELETE FROM webpages_scantime WHERE site=" . $this->siteid . " AND suburl=" . $quoted);
+            $this->count++;
+        }
+    }
+
+    /**
+     * Number of pages indexed plus the number of stale pages removed so far.
+     */
+    function count() {
+        return $this->count;
+    }
+
+}
+?>
diff --git a/portal/tools/search/mainsite.php b/portal/tools/search/mainsite.php
index 3c958e44..a41e8812 100755
--- a/portal/tools/search/mainsite.php
+++ b/portal/tools/search/mainsite.php
@@ -4,14 +4,15 @@
     ini_set("memory_limit", "32M");
 
     if (strlen($argv[1]) < 1) {
-        print "Usage: mainsite.php \n";
+        print "Usage: mainsite.php \n";
         exit(1);
     }
 
     $id = SearchDB::QueryScalarValue("SELECT id FROM sites WHERE baseurl='http://www.postgresql.org'");
 
-    $psi = new PgSiteIndexer($id);
-    $psi->IndexStaticSite($argv[1]);
+    $psi = new SitemapSiteIndexer($id, $argv[1]);
+    $psi->IndexSitemapSite();
+
     print "Finished indexing " . $psi->count() . " pages on static site.\n";
     if ($psi->count() > 0) {
         SearchDB::ExecuteStatement("UPDATE sites SET pagecount=(SELECT count(*) FROM webpages WHERE site=$id) WHERE id=$id");