diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php
index 2f20012..11a5d12 100644
--- a/tools/bigdata/generate.php
+++ b/tools/bigdata/generate.php
@@ -1,5 +1,35 @@
\s+JSON.*?<\/a>/is', $data, $m);
- foreach ($m[1] as $url) {
- $file = str_replace(['/', '.'], '_', $url); // fix file name
- if (!is_cached($file . '.json')) {
- echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL;
- $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url);
- set_cache($file . '.json', $json);
- } else {
- $json = get_cache($file . '.json');
- }
-
- // decode json, write hash values to one file
- echo 'Parsing json file: ' . $file . PHP_EOL;
- foreach (json_decode($json) as $k => $hash) {
- fputs($fp, $hash . "\n");
+ $hash_file = $cache_dir . '/' . $file . '.hash';
+ if (!is_file($hash_file)) {
+ $f = fopen($hash_file, 'w');
+ $fh = new RecursiveIteratorIterator(
+ new RecursiveDirectoryIterator('phar://' . $cache_dir . '/' . $file),
+ RecursiveIteratorIterator::CHILD_FIRST
+ );
+ foreach ($fh as $splFileInfo) {
+ if ($splFileInfo->isFile()) {
+ // store md5 hash we use that in the scanner
+ fputs($f, md5(file_get_contents($splFileInfo->getPathname())) . "\n");
}
}
+ fclose($f);
}
+
+ fputs($fp, file_get_contents($hash_file));
}
function fetch_jquery($fp)
{
echo 'Fetching jQuery' . PHP_EOL;
- $data = file_get_contents('https://code.jquery.com/jquery/');
+ $data = fetch('https://code.jquery.com/jquery/');
preg_match_all(
'/ $file) {
if (!is_cached($file)) {
echo 'Downloading: ' . 'https://code.jquery.com/' . $file . PHP_EOL;
- $data = file_get_contents('https://code.jquery.com/' . $file);
+ $data = fetch('https://code.jquery.com/' . $file);
if (base64_encode(hash('sha256', $data, true)) != $m[2][$k]) {
die('Hash mismatch' . PHP_EOL);
}
@@ -72,10 +98,164 @@ function fetch_jquery($fp)
$data = get_cache($file);
}
+ // store md5 hash we use that in the scanner
fputs($fp, md5($data) . "\n");
}
}
+/**
+ * Downloads a release archive into the cache after verifying its checksum.
+ *
+ * Nothing is done when is_cached($file) already reports the archive as cached.
+ * The script is terminated with die() when the checksum does not match.
+ *
+ * @param string      $file     Cache entry name of the archive.
+ * @param string      $url      Download URL of the archive.
+ * @param string|null $hash     Expected hex digest; replaced by the response of $hash_url when that is given.
+ * @param string      $algo     Algorithm name handed to hash_file(), e.g. 'sha1' or 'md5'.
+ * @param string|null $hash_url Optional URL whose response body is used as the expected digest.
+ */
+function fetch_archive($file, $url, $hash, $algo, $hash_url = null)
+{
+    if (is_cached($file)) {
+        return;
+    }
+
+    // __DIR__ carries no trailing slash, so the separator has to be added explicitly;
+    // without it the download ends up in the parent directory under a fused name.
+    $tmp = __DIR__ . '/dl.tar.gz';
+
+    echo 'Downloading: ' . $url . PHP_EOL;
+    fetch($url, $tmp);
+    if (!empty($hash_url)) {
+        echo 'Downloading hash: ' . $hash_url . PHP_EOL;
+        $hash = fetch($hash_url);
+    }
+
+    // Checksum files frequently end with a newline; strip surrounding whitespace before comparing.
+    $hash = trim((string) $hash);
+    // hash_file() returns false on failure; the cast turns that into an empty digest that can never match.
+    $data_hash = (string) hash_file($algo, $tmp);
+
+    // Strict comparison on purpose: with != two digests that both look numeric
+    // (for example "0e1234..." and "0e5678...") are compared as numbers and count as equal.
+    // An empty expected digest is rejected as well.
+    if ($hash === '' || $data_hash !== $hash) {
+        die('Hash mismatch: ' . $data_hash . ' != ' . $hash . PHP_EOL);
+    }
+
+    set_cache($file, file_get_contents($tmp), $algo, $hash);
+
+    // The temporary download is not needed any more once its content is cached.
+    unlink($tmp);
+}
+
+// Skipped releases: beta, RC, strayhorn, mingus, delta, gold and mu (left out by the regexp)
+// and 1.0.2 (no sha1 is published for it).
+// NOTE(review): the pattern literal below is empty as it appears here - confirm it against the repository copy.
+function fetch_wordpress($fp)
+{
+    echo 'Fetching Wordpress' . PHP_EOL;
+
+    $html = fetch('https://wordpress.org/download/releases/');
+    preg_match_all('//', $html, $m);
+
+    // Group 1 is used as the archive URL, group 2 as the archive file name.
+    foreach ($m[2] as $idx => $archive) {
+        if ($archive == 'wordpress-1.0.2.tar.gz') {
+            // no sha1 info
+            continue;
+        }
+
+        $url = $m[1][$idx];
+        // The expected sha1 is downloaded from "<archive url>.sha1".
+        fetch_archive($archive, $url, null, 'sha1', $url . '.sha1');
+        hash_archive($fp, $archive);
+    }
+}
+
+// Skips snapshot, rc, beta and alpha releases.
+function fetch_typo3($fp)
+{
+    echo 'Fetching Typo3' . PHP_EOL;
+
+    // Version substrings that mark a release as not final.
+    $unstable_markers = ['snapshot', 'rc', 'beta', 'alpha'];
+    // "The specified blob does not exist." / 3.3.0 is a damaged archive
+    $unusable = ['4.6.0', '4.5.33', '3.3.0'];
+
+    $branches = json_decode(fetch('https://get.typo3.org/json'));
+    foreach ($branches as $branch) {
+        // Entries without a release list carry nothing to download.
+        if (!isset($branch->releases)) {
+            continue;
+        }
+
+        foreach ($branch->releases as $release) {
+            $version = $release->version;
+
+            $unstable = false;
+            foreach ($unstable_markers as $marker) {
+                if (strpos($version, $marker) !== false) {
+                    $unstable = true;
+                    break;
+                }
+            }
+            if ($unstable || in_array($version, $unusable)) {
+                continue;
+            }
+
+            // NOTE(review): 'type3-' looks like a typo for 'typo3-'; it is kept because it names the cache entries.
+            $archive = 'type3-' . $version . '.tar.gz';
+            fetch_archive($archive, $release->url->tar, $release->checksums->tar->sha1, 'sha1');
+            hash_archive($fp, $archive);
+        }
+    }
+}
+
+/**
+ * Caches and hashes every Pagekit release listed by the update API.
+ *
+ * The entry stored under the key 'latest' is a single release; every other
+ * entry is iterated as a collection of releases.
+ *
+ * @param resource $fp Handle handed through to hash_archive().
+ */
+function fetch_pagekit($fp)
+{
+    echo 'Fetching Pagekit' . PHP_EOL;
+    $data = json_decode(fetch('https://pagekit.com/api/update'));
+    if (!is_object($data) && !is_array($data)) {
+        // json_decode() yields null for a body it cannot parse; there is nothing to iterate then.
+        echo 'Unable to parse Pagekit release list' . PHP_EOL;
+        return;
+    }
+
+    foreach ($data as $k => $releases) {
+        // Strict check: before PHP 8 the loose 0 == 'latest' is true, so an entry
+        // with the integer key 0 would be mistaken for the single "latest" release.
+        if ($k === 'latest') {
+            $releases = [$releases];
+        }
+        foreach ($releases as $release) {
+            $file = 'pagekit-' . $release->version . '.tar.gz';
+            fetch_archive($file, $release->url, $release->shasum, 'sha1');
+            hash_archive($fp, $file);
+        }
+    }
+}
+
+// Walks the paginated Drupal release listing of every entry in $versions and caches + hashes each archive found. Ignored releases are: alpha, beta, rc, dev
+function fetch_drupal($fp, $versions) // $versions maps a label (echoed only) to the id sent as the api_version[] query parameter
+{
+ foreach ($versions as $version => $id) {
+ echo 'Fetching Drupal ' . $version . PHP_EOL;
+
+ $page = 0;
+ $pages = false; // stays false until a "last" pagination link has been seen
+ do {
+ $data = fetch('https://www.drupal.org/project/drupal/releases?api_version%5B%5D=' . $id . '&page=' .$page);
+
+ // pagination init: evaluated until a page carries a "last" link. NOTE(review): the "ยป" in the pattern looks like a mis-encoded character - confirm against the repository copy
+ if ($pages === false && preg_match('/&page=(\d+)">last ยป<\/a>/', $data, $m)) {
+ $pages = $m[1]; // page number captured from the "last" link
+ }
+
+ preg_match_all(
+ '/data-th="Download">(.*?)\s*([a-z0-9]{32})\s*<\/td>/is', // NOTE(review): only two capture groups are visible here, yet groups 2, 3 and 5 are read below - the pattern appears truncated, confirm
+ $data,
+ $m
+ );
+ foreach ($m[3] as $k => $file) { // group 3 is used as the archive file name
+ fetch_archive($file, $m[2][$k], $m[5][$k], 'md5'); // group 2 is passed as the download URL, group 5 as the expected md5
+ hash_archive($fp, $file);
+ }
+
+ if ($pages === false) { // no "last" link was found, so only this one page is processed
+ break;
+ }
+
+ $page++;
+ }while($page <= $pages); // pages are requested from 0 up to and including the captured last page number
+ }
+}
+
+/**
+ * Caches and hashes the full-package tarballs of the listed Joomla series.
+ *
+ * $versions maps a label (echoed to the console) to the number appended to
+ * https://downloads.joomla.org/cms/joomla to build the listing URL of a series.
+ */
+function fetch_joomla($fp, $versions)
+{
+    foreach ($versions as $label => $series) {
+        echo 'Fetching Joomla ' . $label . PHP_EOL;
+
+        $listing = fetch('https://downloads.joomla.org/cms/joomla' . $series);
+        preg_match_all('/href="(\/cms\/joomla\d+\/(\d+\-\d+\-\d+))"/', $listing, $m);
+
+        foreach ($m[1] as $k => $url) {
+            // Dash-separated release number taken from the link, e.g. the "x-y-z" part of the path.
+            $release = $m[2][$k];
+            $file = 'joomla_' . $release . '-stable-full_package.tar.gz';
+
+            // The release page is fetched only for uncached archives because it supplies the hash and the download link.
+            if (!is_cached($file)) {
+                $page = fetch('https://downloads.joomla.org' . $url);
+
+                $sha1_pattern = '/Joomla! '.str_replace('-', '\.', $release).' Full Package \(\.tar\.gz\).*?SHA1 Signature\s*<\/dt>\s*\s*([a-z0-9]{40})\s*<\/dd>/is';
+                if (!preg_match($sha1_pattern, $page, $sha1)) {
+                    echo 'Unable to find SHA1 signature for version ' . $release . PHP_EOL;
+                    // leaves the release loop, so the remaining releases of this series are not processed
+                    break;
+                }
+
+                $link_pattern = '/href="('.preg_quote($url, '/').'\/.*?format=gz)"/';
+                if (!preg_match($link_pattern, $page, $link)) {
+                    echo 'Unable to find archive url for version ' . $release . PHP_EOL;
+                    // same as above: the rest of this series is not processed
+                    break;
+                }
+
+                fetch_archive($file, 'https://downloads.joomla.org' . $link[1], $sha1[1], 'sha1');
+            }
+
+            hash_archive($fp, $file);
+        }
+    }
+}
+
+// The cache directory is the single command line argument when exactly one is given,
+// otherwise the "cache" directory next to this script.
+$cache_dir = ($argc == 2)
+    ? $argv[1]
+    : __DIR__ . '/cache';
+
if (!is_readable($cache_dir)) {
if (!mkdir($cache_dir)) {
die('Unable to create cache directory');
@@ -84,8 +264,32 @@ if (!is_readable($cache_dir)) {
$fp = fopen('all.txt', 'w');
-// fetch_kubik_rubik($fp);
+// TODO: add MODX releases (https://modx.com/download/other-downloads)
+// TODO: add WordPress plugins (only the popular ones)
+
fetch_jquery($fp);
+fetch_wordpress($fp); // every fetch_* call below receives the same $fp handle
+fetch_typo3($fp);
+fetch_pagekit($fp);
+fetch_drupal( // second argument: label (echoed only) => id sent as the api_version[] query parameter of the release listing
+ $fp,
+ [
+ '9.x' => 39794,
+ '8.x' => 7234,
+ '7.x' => 103,
+ '6.x' => 87,
+ '5.x' => 78,
+ '4.7.x' => 79,
+ '4.6.x' => 80,
+ '4.5.x' => 81,
+ '4.4.x' => 82,
+ '4.3.x' => 83,
+ '4.2.x' => 84,
+ '4.1.x' => 85,
+ '4.0.x' => 86
+ ]
+);
+fetch_joomla($fp, ['3.0' => 3, '2.5' => 25, '1.5' => 15, '1.0' => 10]); // label (echoed only) => number appended to https://downloads.joomla.org/cms/joomla
fclose($fp);