The Code Sample of Indexing


The following program implements the “Index web pages” button of the previous interface. It adds the URL to the inverted list of the keyword entered from the Web if the contents of the page at that URL include the keyword. If the keyword does not yet exist in the inverted list, the keyword is added first, and the URL is then added under it if the page contents include the keyword.

You may need to create the empty file result.txt initially and make it writable via “chmod 777 result.txt”. Otherwise, the program may not be able to write to result.txt.

http://undcemcs01.und.edu/~wen.chen.hu/course/515/2/Spider.php
<?php

 # SQL> CREATE TABLE  keywords (
 #   2    kwID     INT AUTO_INCREMENT PRIMARY KEY,
 #   3    keyword  VARCHAR(64) );
 #
 # SQL> CREATE TABLE  url_title (
 #   2    urlID  INT AUTO_INCREMENT PRIMARY KEY,
 #   3    url    VARCHAR(128),
 #   4    title  VARCHAR(128) );
 #
 # SQL> CREATE TABLE  www_index (
 #   2    kwID   INT,
 #   3    urlID  INT,
 #   4    PRIMARY KEY ( kwID, urlID ),
 #   5    FOREIGN KEY ( kwID  ) REFERENCES  keywords  ( kwID ),
 #   6    FOREIGN KEY ( urlID ) REFERENCES  url_title ( urlID ) );

 # Read the command-line arguments and connect to the database.
 # Usage: php Spider.php <keyword> <URL>
 include 'password.php';     // Containing only one line: $password="your-pw";

 # Both arguments are required: an empty keyword cannot be searched for
 # (substr_count rejects an empty needle) and an empty URL cannot be fetched.
 if ( !isset( $argv[1], $argv[2] ) || $argv[1] === '' || $argv[2] === '' )
   die( "Usage: php Spider.php <keyword> <URL>\n" );

 $keyword  = $argv[1];       // The keyword to be indexed
 $URL      = $argv[2];       // The URL of the page to be checked
 $username = "your-id@undcsmysql";
 $database = "your-db";
 $host     = "undcsmysql.mysql.database.azure.com";
 $conn     = new mysqli( $host, $username, $password, $database );

 if ( $conn->connect_error )
   die( 'Could not connect: ' . $conn->connect_error );

 # Dump the source code to the file result.txt.
 # The URL comes from the command line, so it is quoted with
 # escapeshellarg( ); a URL containing a single quote could otherwise
 # break out of the quotes and run arbitrary shell commands.
 $cmd = "lynx -dump -source " . escapeshellarg( $URL ) . " > result.txt";
 # Make result.txt and the directory ../2/ writable while lynx runs,
 # then restore the directory permissions afterwards.
 system( "chmod 777 result.txt ../2/" );
 system( $cmd );
 system( "chmod 755 ../2/" );

 # Find the page title by using a regular expression.
 # The match is case-insensitive (i), lets "." span newlines (s), and
 # tolerates attributes on the opening tag, so <TITLE> and multi-line
 # titles are found too.  $title is '' when result.txt cannot be read
 # or the page has no title.
 $file    = file_get_contents( "result.txt" );
 $pattern = '/<title\b[^>]*>.*?<\/title>/is';
 $title   = '';
 if ( $file !== false && preg_match( $pattern, $file, $matches ) === 1 )
   $title = trim( strip_tags( $matches[0] ) );

 # Scan result.txt one line at a time and set $found to true as soon as
 # a line containing the keyword is seen; $found stays false otherwise.
 $handle = fopen( "result.txt", "r" );
 if ( !$handle )
   exit( "Unable to open file!" );
 $found = false;
 while ( !$found && !feof( $handle ) )
   $found = substr_count( fgets( $handle ), $keyword ) > 0;
 fclose( $handle );

 # Find the ID of the input keyword from the keywords table, inserting
 # the keyword first if it is not there yet.  The keyword is bound as a
 # statement parameter instead of being pasted into the SQL text, so
 # quotes in it can neither break nor alter the query.
 $kwID = null;
 $sql  = "SELECT kwID FROM keywords WHERE keyword=?;";
 echo( $sql . "\n\n" );
 $stmt = $conn->prepare( $sql ) or
   die( 'Query failed: ' . $conn->error );
 $stmt->bind_param( 's', $keyword );
 $stmt->execute( ) or
   die( 'Query failed: ' . $stmt->error );
 $stmt->bind_result( $rowKwID );
 while ( $stmt->fetch( ) )
   $kwID = $rowKwID;
 $stmt->close( );

 if ( $kwID === null ) {
   $sql = "INSERT INTO keywords( keyword ) VALUES ( ? );";
   echo( $sql . "\n\n" );
   $stmt = $conn->prepare( $sql ) or
     die( 'Query failed: ' . $conn->error );
   $stmt->bind_param( 's', $keyword );
   $stmt->execute( ) or
     die( 'Query failed: ' . $stmt->error );
   # kwID is AUTO_INCREMENT, so the connection reports the new ID
   # directly and no second SELECT is needed.
   $kwID = $conn->insert_id;
   $stmt->close( );
 }

 # Find the ID of the input URL from the url_title table, inserting the
 # URL and its title first if the URL is not there yet.  The URL and the
 # title are bound as statement parameters: both come from outside the
 # program (command line and fetched page), and a quote in either one
 # would otherwise break or alter the query.
 $urlID = null;
 $sql   = "SELECT urlID FROM url_title WHERE url=?;";
 echo( $sql . "\n\n" );
 $stmt = $conn->prepare( $sql ) or
   die( 'Query failed: ' . $conn->error );
 $stmt->bind_param( 's', $URL );
 $stmt->execute( ) or
   die( 'Query failed: ' . $stmt->error );
 $stmt->bind_result( $rowUrlID );
 while ( $stmt->fetch( ) )
   $urlID = $rowUrlID;
 $stmt->close( );

 if ( $urlID === null ) {
   $sql = "INSERT INTO url_title( url, title ) VALUES ( ?, ? );";
   echo( $sql . "\n\n" );
   $stmt = $conn->prepare( $sql ) or
     die( 'Query failed: ' . $conn->error );
   $stmt->bind_param( 'ss', $URL, $title );
   $stmt->execute( ) or
     die( 'Query failed: ' . $stmt->error );
   # urlID is AUTO_INCREMENT, so the connection reports the new ID
   # directly and no second SELECT is needed.
   $urlID = $conn->insert_id;
   $stmt->close( );
 }

 # Update the inverted list if the keyword is found and both IDs are
 # known.  INSERT IGNORE is used because ( kwID, urlID ) is the primary
 # key of www_index: indexing the same keyword/URL pair again would
 # otherwise fail with a duplicate-key error.
 if ( $found && isset( $kwID, $urlID ) ) {
   $sql = "INSERT IGNORE INTO www_index VALUES ( ?, ? );";
   echo(  $sql . "\n\n" );
   $stmt = $conn->prepare( $sql ) or
     die( 'Query failed: ' . $conn->error );
   $stmt->bind_param( 'ii', $kwID, $urlID );
   $stmt->execute( ) or
     die( 'Query failed: ' . $stmt->error );
   $stmt->close( );
 }

 $conn->close( );

?>