From 0546396d015d9e1b4955801b7a452a1cd081306b Mon Sep 17 00:00:00 2001 From: Gabor Gyorvari Date: Fri, 22 Feb 2019 11:50:15 +0100 Subject: [PATCH] Combined whitelist sources updated --- tools/bigdata/generate.php | 252 +++++++++++++++++++++++++++++++++---- 1 file changed, 228 insertions(+), 24 deletions(-) diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php index 2f20012..11a5d12 100644 --- a/tools/bigdata/generate.php +++ b/tools/bigdata/generate.php @@ -1,5 +1,35 @@ \s+JSON.*?<\/a>/is', $data, $m); - foreach ($m[1] as $url) { - $file = str_replace(['/', '.'], '_', $url); // fix file name - if (!is_cached($file . '.json')) { - echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL; - $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url); - set_cache($file . '.json', $json); - } else { - $json = get_cache($file . '.json'); - } - - // decode json, write hash values to one file - echo 'Parsing json file: ' . $file . PHP_EOL; - foreach (json_decode($json) as $k => $hash) { - fputs($fp, $hash . "\n"); + $hash_file = $cache_dir . '/' . $file . '.hash'; + if (!is_file($hash_file)) { + $f = fopen($hash_file, 'w'); + $fh = new RecursiveIteratorIterator( + new RecursiveDirectoryIterator('phar://' . $cache_dir . '/' . $file), + RecursiveIteratorIterator::CHILD_FIRST + ); + foreach ($fh as $splFileInfo) { + if ($splFileInfo->isFile()) { + // store md5 hash we use that in the scanner + fputs($f, md5(file_get_contents($splFileInfo->getPathname())) . "\n"); } } + fclose($f); } + + fputs($fp, file_get_contents($hash_file)); } function fetch_jquery($fp) { echo 'Fetching jQuery' . PHP_EOL; - $data = file_get_contents('https://code.jquery.com/jquery/'); + $data = fetch('https://code.jquery.com/jquery/'); preg_match_all( '/ $file) { if (!is_cached($file)) { echo 'Downloading: ' . 'https://code.jquery.com/' . $file . PHP_EOL; - $data = file_get_contents('https://code.jquery.com/' . $file); + $data = fetch('https://code.jquery.com/' . $file); if (base64_encode(hash('sha256', $data, true)) != $m[2][$k]) { die('Hash mismatch' . PHP_EOL); } @@ -72,10 +98,164 @@ function fetch_jquery($fp) $data = get_cache($file); } + // store md5 hash we use that in the scanner fputs($fp, md5($data) . "\n"); } } +function fetch_archive($file, $url, $hash, $algo, $hash_url = null) +{ + $tmp = __DIR__ . 'dl.tar.gz'; + if (!is_cached($file)) { + echo 'Downloading: ' . $url . PHP_EOL; + fetch($url, $tmp); + if (!empty($hash_url)) { + echo 'Downloading hash: ' . $hash_url . PHP_EOL; + $hash = fetch($hash_url); + } + $data_hash = hash_file($algo, $tmp); + if ($data_hash != $hash) { + die('Hash mismatch: ' . $data_hash . ' != ' . $hash . PHP_EOL); + } + set_cache($file, file_get_contents($tmp), $algo, $hash); + } +} + +// Ignored releases are: beta, RC, strayhorn, mingus, delta, gold and mu by regexp and 1.0.2 because no sha1 +function fetch_wordpress($fp) +{ + echo 'Fetching Wordpress' . PHP_EOL; + $data = fetch('https://wordpress.org/download/releases/'); + + preg_match_all( + '//', + $data, + $m + ); + foreach ($m[2] as $k => $file) { + if ($m[2][$k] == 'wordpress-1.0.2.tar.gz') { + // no sha1 info + continue; + } + fetch_archive($m[2][$k], $m[1][$k], null, 'sha1', $m[1][$k] . '.sha1'); + hash_archive($fp, $file); + } +} + +// Ignores: snapshots, rc, beta, alpha +function fetch_typo3($fp) +{ + echo 'Fetching Typo3' . PHP_EOL; + $data = json_decode(fetch('https://get.typo3.org/json')); + foreach ($data as $value) { + if (isset($value->releases)) { + foreach ($value->releases as $release) { + if (strstr($release->version, 'snapshot') || strstr($release->version, 'rc') || strstr($release->version, 'beta') || strstr($release->version, 'alpha')) { + // ignoring snapshots + continue; + } + if (in_array($release->version, ['4.6.0', '4.5.33', '3.3.0'])) { + // The specified blob does not exist. + // 3.3.0 is damaged archive + continue; + } + $file = 'type3-' . $release->version . '.tar.gz'; + fetch_archive($file, $release->url->tar, $release->checksums->tar->sha1, 'sha1'); + hash_archive($fp, $file); + } + } + } +} + +function fetch_pagekit($fp) +{ + echo 'Fetching Pagekit' . PHP_EOL; + $data = json_decode(fetch('https://pagekit.com/api/update')); + foreach ($data as $k => $releases) { + if ($k == 'latest') { + $releases = [$releases]; + } + foreach ($releases as $release) { + $file = 'pagekit-' . $release->version . '.tar.gz'; + fetch_archive($file, $release->url, $release->shasum, 'sha1'); + hash_archive($fp, $file); + } + } +} + +// Ignored releases are: alpha, beta, rc, dev +function fetch_drupal($fp, $versions) +{ + foreach ($versions as $version => $id) { + echo 'Fetching Drupal ' . $version . PHP_EOL; + + $page = 0; + $pages = false; + do { + $data = fetch('https://www.drupal.org/project/drupal/releases?api_version%5B%5D=' . $id . '&page=' .$page); + + // pagination init + if ($pages === false && preg_match('/&page=(\d+)">last ยป<\/a>/', $data, $m)) { + $pages = $m[1]; + } + + preg_match_all( + '/data-th="Download">(.*?)\s*([a-z0-9]{32})\s*<\/td>/is', + $data, + $m + ); + foreach ($m[3] as $k => $file) { + fetch_archive($file, $m[2][$k], $m[5][$k], 'md5'); + hash_archive($fp, $file); + } + + if ($pages === false) { + break; + } + + $page++; + }while($page <= $pages); + } +} + +function fetch_joomla($fp, $versions) +{ + foreach ($versions as $version => $id) { + echo 'Fetching Joomla ' . $version . PHP_EOL; + + $data = fetch('https://downloads.joomla.org/cms/joomla' . $id); + preg_match_all('/href="(\/cms\/joomla\d+\/(\d+\-\d+\-\d+))"/', $data, $m); + foreach ($m[1] as $k => $url) { + $file = 'joomla_' . $m[2][$k] . '-stable-full_package.tar.gz'; + + // pre check because we need hash information + if (!is_cached($file)) { + $data = fetch('https://downloads.joomla.org' . $url); + + if (!preg_match('/Joomla! '.str_replace('-', '\.', $m[2][$k]).' Full Package \(\.tar\.gz\).*?SHA1 Signature\s*<\/dt>\s*
\s*([a-z0-9]{40})\s*<\/dd>/is', $data, $m2)) { + echo 'Unable to find SHA1 signature for version ' . $m[2][$k] . PHP_EOL; + break; + } + + if (!preg_match('/href="('.preg_quote($url, '/').'\/.*?format=gz)"/', $data, $m3)) { + echo 'Unable to find archive url for version ' . $m[2][$k] . PHP_EOL; + break; + } + + fetch_archive($file, 'https://downloads.joomla.org' . $m3[1], $m2[1], 'sha1'); + } + + hash_archive($fp, $file); + } + } +} + +if ($argc == 2) { + $cache_dir = $argv[1]; +} else { + $cache_dir = __DIR__ . '/cache'; +} + if (!is_readable($cache_dir)) { if (!mkdir($cache_dir)) { die('Unable to create cache directory'); @@ -84,8 +264,32 @@ if (!is_readable($cache_dir)) { $fp = fopen('all.txt', 'w'); -// fetch_kubik_rubik($fp); +// TODO https://modx.com/download/other-downloads +// TODO wordpress plugins only popular ones + fetch_jquery($fp); +fetch_wordpress($fp); +fetch_typo3($fp); +fetch_pagekit($fp); +fetch_drupal( + $fp, + [ + '9.x' => 39794, + '8.x' => 7234, + '7.x' => 103, + '6.x' => 87, + '5.x' => 78, + '4.7.x' => 79, + '4.6.x' => 80, + '4.5.x' => 81, + '4.4.x' => 82, + '4.3.x' => 83, + '4.2.x' => 84, + '4.1.x' => 85, + '4.0.x' => 86 + ] +); +fetch_joomla($fp, ['3.0' => 3, '2.5' => 25, '1.5' => 15, '1.0' => 10]); fclose($fp);