1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
<?php

/* (C) 2010 - Antonio Ognio <[email protected]> */
/* This script scrapes concert setlist data for Dream Theater in 2009 from setlist.fm */
/* MySQL schema: 

CREATE TABLE setlist (
  id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT,
  concert_date DATE NOT NULL,
  venue VARCHAR(64) NOT NULL,
  position TINYINT UNSIGNED NOT NULL,
  song VARCHAR(64) NOT NULL,
  PRIMARY KEY(id),
  UNIQUE INDEX date_position_uidx (concert_date, position)
);

*/


/**
 * Collects the URLs of the 2009 Dream Theater setlist pages from setlist.fm.
 *
 * Fetches listing pages 1 through 8 of the artist's setlist index and scans
 * each line for an href pointing below setlist/dream-theater/2009/. Only
 * hrefs that contain a ';' match; the captured URL ends just before it.
 * Every '../' in the captured URL is replaced with the site root so the
 * result is absolute.
 *
 * A listing page that cannot be downloaded is skipped instead of being fed
 * to foreach as FALSE.
 *
 * @return array List of unique setlist URLs, in order of first appearance.
 */
function get_setlists_2009() {
  $baseurl = "http://www.setlist.fm/setlists/dream-theater-bd6a102.html";
  $regexp = '|href="(.*setlist/dream-theater/2009/.*);.*"|';
  $urls = array();
  /* URL => true; constant-time duplicate check instead of scanning $urls */
  $seen = array();
  for ($page = 1; $page <= 8; $page++) {
    $contents = file("$baseurl?page=$page");
    if ($contents === false) {
      /* file() has already raised a warning; move on to the next page */
      continue;
    }
    foreach ($contents as $line) {
      if (!preg_match($regexp, rtrim($line), $matches)) {
        continue;
      }
      $u = str_replace('../', 'http://www.setlist.fm/', $matches[1]);
      if (!isset($seen[$u])) {
        $seen[$u] = true;
        $urls[] = $u;
      }
    }
  }
  return $urls;
}

/**
 * Reads one setlist page and extracts the concert date, venue and songs.
 *
 * The page is scanned line by line:
 *  - venue and date come from the first line matching the heading
 *    "Dream Theater Concert at <venue> Setlist on <date>, 2009</h1>";
 *    the venue is HTML-entity-decoded and the date is reformatted as Y-m-d.
 *  - songs are the contents of <span class="">...</span> elements, with tags
 *    stripped, found from a line containing <ol> up to and including a line
 *    containing </ol>.
 *
 * @param string $url URL (or any other path file() can read) of the page.
 * @return array Array with keys 'date' (Y-m-d string, or NULL when no heading
 *               was found), 'venue' (string or NULL) and 'songs' (array of
 *               titles keyed by position, starting at 1). If the page cannot
 *               be read the result holds NULL, NULL and an empty song list.
 */
function grab_setlist($url) {
  $heading_regexp = '|Dream Theater Concert at (.+) Setlist on (.+, 2009)</h1>|';
  $song_regexp = '|<span class="">(.+)</span>|';
  $date = NULL;
  /* Must be initialised: it is returned even when no heading line matches. */
  $venue = NULL;
  $songs = array();
  $in_list = false;

  $contents = file($url);
  if ($contents === false) {
    /* file() has already raised a warning; fall through with no lines */
    $contents = array();
  }

  foreach ($contents as $line) {
    /* extract venue and date; only the first matching heading is used */
    if (is_null($date) && preg_match($heading_regexp, $line, $matches)) {
      $venue = html_entity_decode($matches[1]);
      $date = date("Y-m-d", strtotime($matches[2]));
    }
    /* extract songs; <ol> is checked first and </ol> last so that a song on
       the same line as either tag is still picked up */
    if (preg_match('|<ol>|', $line)) {
      $in_list = true;
    }
    if ($in_list && preg_match($song_regexp, $line, $matches)) {
      /* positions are 1-based */
      $songs[count($songs) + 1] = strip_tags($matches[1]);
    }
    if (preg_match('|</ol>|', $line)) {
      $in_list = false;
    }
  }

  return array(
    'date' => $date,
    'venue' => $venue,
    'songs' => $songs
  );
}

/**
 * Builds one SQL INSERT statement per song of a setlist.
 *
 * Date, venue and song title are scraped text, so backslashes and single
 * quotes in them are escaped (backslash doubled, quote doubled) before they
 * are placed inside the quoted SQL literals. Values without either character
 * produce exactly the same statement text as before.
 *
 * @param array  $setlist Array with keys 'date', 'venue' and 'songs'
 *                        (array of position => song title).
 * @param string $table   Name of the target table; inserted verbatim.
 * @return array List of INSERT statement strings, one per song, in the
 *               iteration order of 'songs'.
 */
function generate_inserts($setlist, $table="setlist") {
  /* Read the keys explicitly instead of extract()ing arbitrary keys into scope. */
  $date = isset($setlist['date']) ? $setlist['date'] : '';
  $venue = isset($setlist['venue']) ? $setlist['venue'] : '';
  $songs = isset($setlist['songs']) ? $setlist['songs'] : array();

  /* Backslash must be handled first so the doubled quotes are left alone. */
  $search = array('\\', "'");
  $replace = array('\\\\', "''");
  $date = str_replace($search, $replace, (string) $date);
  $venue = str_replace($search, $replace, (string) $venue);

  /* NOTE(review): the first value '' is presumably meant for the
     AUTO_INCREMENT id column; confirm the target server accepts it. */
  $results = array();
  foreach ($songs as $position => $title) {
    $results[] = sprintf(
      "INSERT INTO %s VALUES('', '%s', '%s', '%d','%s');",
      $table,
      $date,
      $venue,
      $position,
      str_replace($search, $replace, (string) $title)
    );
  }
  return $results;
}

/* Driver: look up every 2009 setlist URL, scrape each page and print the
   resulting INSERT statements to standard output, one per line. */
foreach (get_setlists_2009() as $setlist_url) {
  $statements = generate_inserts(grab_setlist($setlist_url));
  foreach ($statements as $statement) {
    echo $statement, "\n";
  }
}

?>