From 5cf90cd3718ae9e56f373ac8dec18260ed609c12 Mon Sep 17 00:00:00 2001 From: Gabor Gyorvari Date: Mon, 10 Dec 2018 13:02:03 +0100 Subject: [PATCH 1/4] Working with pre generated big hash database for whitelisting --- scan.php | 115 ++++++++++++++++++++++++++++++++++--- tools/bigdata/generate.php | 41 +++++++++++++ 2 files changed, 149 insertions(+), 7 deletions(-) create mode 100644 tools/bigdata/generate.php diff --git a/scan.php b/scan.php index 14e2144..79b27e0 100644 --- a/scan.php +++ b/scan.php @@ -39,6 +39,7 @@ class MalwareScanner private $flagFollowSymlink = false; private $flagLineNumber = false; private $flagScanEverything = false; + private $flagBigData = false; private $outputFormat = ''; private $whitelist = array(); private $ignore = array(); @@ -54,6 +55,8 @@ class MalwareScanner private $patterns_re = array(); private $patterns_b64functions = array(); private $patterns_b64keywords = array(); + private $bigdata = array(); + private $bigdata_count = 0; /** * MalwareScanner constructor. @@ -124,6 +127,11 @@ class MalwareScanner //Check if the md5 checksum exists in the whitelist and returns true if it does. private function inWhitelist($hash) { + if ($this->flagBigData) { + if ($this->binarySearch($hash, $this->bigdata, $this->bigdata_count) > -1) { + return true; + } + } return in_array($hash, $this->whitelist); } @@ -221,7 +229,8 @@ class MalwareScanner 'line-number', 'output-format:', 'wordpress-version:', - 'scan-everything' + 'scan-everything', + 'big-data' ) ); @@ -301,6 +310,9 @@ class MalwareScanner if (isset($options['scan-everything']) || isset($options['E'])) { $this->setFlagScanEverything(true); } + if (isset($options['big-data'])) { + $this->setFlagBigData(true); + } } public function setExtensions(array $a) @@ -384,6 +396,11 @@ class MalwareScanner $this->flagScanEverything = $b; } + public function setFlagBigData($b) + { + $this->flagBigData = $b; + } + // @see http://stackoverflow.com/a/13914119 private function pathMatches($path, $pattern, $ignoreCase = false) { @@ -557,24 +574,33 @@ class MalwareScanner echo 'Total malware identified: ' . $this->stat['files_infected'] . PHP_EOL; } - //Validates the input directory - //Calls the load pattern and load whitelist functions - //Calls the process and report functions. + /** + * Validates the input directory + * + * - Calls the load pattern and load whitelist functions + * - Fetch and load big data white list + * - Calls the process and report functions. + * + * @param $dir + * @return bool + */ public function run($dir) { - //Make sure the input is a valid directory path. + // Make sure the input is a valid directory path. $dir = rtrim($dir, '/'); if (!is_dir($dir)) { $this->error('Specified path is not a directory: ' . $dir); return false; } - //Load Patterns $this->initializePatterns(); - //Load Whitelist $this->loadWhitelist(); + if ($this->flagBigData && !$this->updateBigData()) { + return false; + } + $start = time(); $this->process($dir . '/'); $this->report($start, $dir . '/'); @@ -674,6 +700,80 @@ class MalwareScanner } } + // @see https://www.mkwd.net/binary-search-algorithm-in-php/ + private function binarySearch($needle, array $haystack, $high, $low = 0) + { + $key = false; + // Whilst we have a range. If not, then that match was not found. + while ($high >= $low) { + // Find the middle of the range. + $mid = (int)floor(($high + $low) / 2); + // Compare the middle of the range with the needle. This should return <0 if it's in the first part of the range, + // or >0 if it's in the second part of the range. It will return 0 if there is a match. + $cmp = strcmp($needle, $haystack[$mid]); + // Adjust the range based on the above logic, so the next loop iteration will use the narrowed range + if ($cmp < 0) { + $high = $mid - 1; + } elseif ($cmp > 0) { + $low = $mid + 1; + } else { + $key = $mid; + break; + } + } + + return $key; + } + + private function updateBigData() + { + $url = 'http://127.0.0.1:4000'; + $latest_hash = trim(file_get_contents($url . '/database/compressed.sha256')); + if ($latest_hash === false) { + $this->error('Unable to download database checksum'); + return false; + } + + $file = __DIR__ . '/bigdata.dat'; + if (is_readable($file)) { + $hash = hash_file('sha256', $file); + if ($hash != $latest_hash) { + $download = true; + } else { + $download = false; + } + } else { + $download = true; + } + + if ($download) { + $data = file_get_contents($url . '/database/compressed.dat'); + if ($data === false) { + $this->error('Unable to download database'); + return false; + } + + file_put_contents($file, $data); + $hash = hash_file('sha256', $file); + if ($hash != $latest_hash) { + $this->error('Downloaded database hash mismatch'); + } + } + + $content = gzdecode(file_get_contents($file)); + $this->bigdata = []; + $this->bigdata_count = 0; + foreach (explode("\n", $content) as $line) { // faster than strtok, but needs more memory + if ($line) { + $this->bigdata[] = $line; + $this->bigdata_count++; + } + } + $this->bigdata_count -= 1; // -1 because we use indexes in binary search + echo 'Big data loaded hash count: ' . ($this->bigdata_count + 1) . PHP_EOL; + return true; + } + //Prints out the usage menu options. private function showHelp() { @@ -698,6 +798,7 @@ class MalwareScanner echo ' -L --line-number Display matching pattern line number in file' . PHP_EOL; echo ' -o --output-format Custom defined output format' . PHP_EOL; echo ' -j --wordpress-version Version of wordpress to get md5 signatures' . PHP_EOL; + echo ' --big-data General whitelist' . PHP_EOL; } diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php new file mode 100644 index 0000000..d9cd239 --- /dev/null +++ b/tools/bigdata/generate.php @@ -0,0 +1,41 @@ +\s+JSON.*?<\/a>/is', $data, $m); + foreach ($m[1] as $url) { + $file = str_replace(['/', '.'], '_', $url); // fix file name + if (!is_readable(__DIR__ . '/cache/' . $file . '.json')) { + echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL; + $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url); + file_put_contents(__DIR__ . '/cache/' . $file . '.json', $json); + } else { + $json = file_get_contents(__DIR__ . '/cache/' . $file . '.json'); + } + + // decode json, write hash values to one file + echo 'Parsing json file: ' . $file . PHP_EOL; + foreach (json_decode($json) as $k => $hash) { + fputs($fp, $hash . "\n"); + } + } +} + +fclose($fp); + +echo 'Create unique database' . PHP_EOL; +exec('sort -u -o unique.txt all.txt'); + +echo 'Compressing all.txt' . PHP_EOL; +exec('gzip < unique.txt > compressed.dat'); // gzencode \ No newline at end of file From afc4cd4ef97805f9811575ab7645a62604e84862 Mon Sep 17 00:00:00 2001 From: Gabor Gyorvari Date: Mon, 10 Dec 2018 13:05:21 +0100 Subject: [PATCH 2/4] Update to live URL for big data --- scan.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scan.php b/scan.php index 79b27e0..e61e04a 100644 --- a/scan.php +++ b/scan.php @@ -725,9 +725,8 @@ class MalwareScanner return $key; } - private function updateBigData() + private function updateBigData($url = 'https://scr34m.github.io/php-malware-scanner') { - $url = 'http://127.0.0.1:4000'; $latest_hash = trim(file_get_contents($url . '/database/compressed.sha256')); if ($latest_hash === false) { $this->error('Unable to download database checksum'); From a0c59205e1d222421b9fd95412deaec860437327 Mon Sep 17 00:00:00 2001 From: Gabor Gyorvari Date: Tue, 11 Dec 2018 07:22:22 +0100 Subject: [PATCH 3/4] jQuery hashes added to pre generated list --- tools/bigdata/generate.php | 105 +++++++++++++++++++++++++++---------- tools/jquery.py | 22 -------- 2 files changed, 78 insertions(+), 49 deletions(-) delete mode 100644 tools/jquery.py diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php index d9cd239..81386ce 100644 --- a/tools/bigdata/generate.php +++ b/tools/bigdata/generate.php @@ -1,41 +1,92 @@ \s+JSON.*?<\/a>/is', $data, $m); + foreach ($m[1] as $url) { + $file = str_replace(['/', '.'], '_', $url); // fix file name + if (!is_cached($file . '.json')) { + echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL; + $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url); + set_cache($file . '.json', $json); + } else { + $json = get_cache($file . '.json'); + } + + // decode json, write hash values to one file + echo 'Parsing json file: ' . $file . PHP_EOL; + foreach (json_decode($json) as $k => $hash) { + fputs($fp, $hash . "\n"); + } + } } } -echo 'Fetching checksums.kubik-rubik.de' . PHP_EOL; -$data = file_get_contents('https://checksums.kubik-rubik.de'); +function fetch_jquery($fp) +{ + echo 'Fetching jQuery' . PHP_EOL; + $data = file_get_contents('https://code.jquery.com/jquery/'); + + preg_match_all('/\s+JSON.*?<\/a>/is', $data, $m); - foreach ($m[1] as $url) { - $file = str_replace(['/', '.'], '_', $url); // fix file name - if (!is_readable(__DIR__ . '/cache/' . $file . '.json')) { - echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL; - $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url); - file_put_contents(__DIR__ . '/cache/' . $file . '.json', $json); - } else { - $json = file_get_contents(__DIR__ . '/cache/' . $file . '.json'); - } - - // decode json, write hash values to one file - echo 'Parsing json file: ' . $file . PHP_EOL; - foreach (json_decode($json) as $k => $hash) { - fputs($fp, $hash . "\n"); - } - } -} +fetch_kubik_rubik($fp); +fetch_jquery($fp); fclose($fp); -echo 'Create unique database' . PHP_EOL; +echo 'Creating unique database' . PHP_EOL; exec('sort -u -o unique.txt all.txt'); echo 'Compressing all.txt' . PHP_EOL; -exec('gzip < unique.txt > compressed.dat'); // gzencode \ No newline at end of file +exec('gzip < unique.txt > compressed.dat'); // gzencode + +$hash = hash_file('sha256', 'compressed.dat'); +file_put_contents('compressed.sha256', $hash); +echo 'SHA256 is ' . $hash . PHP_EOL; \ No newline at end of file diff --git a/tools/jquery.py b/tools/jquery.py deleted file mode 100644 index 2b00bd5..0000000 --- a/tools/jquery.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python - -import urllib2 -import re -import hashlib - -def fetch(url): - response = urllib2.urlopen(url) - return response.read() - -def main(): - html = fetch('https://code.jquery.com/jquery/') - regex = re.compile(r" Date: Mon, 31 Dec 2018 11:10:31 +0100 Subject: [PATCH 4/4] Combined whitelist release --- scan.php | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/scan.php b/scan.php index e61e04a..d2cbc08 100644 --- a/scan.php +++ b/scan.php @@ -39,7 +39,7 @@ class MalwareScanner private $flagFollowSymlink = false; private $flagLineNumber = false; private $flagScanEverything = false; - private $flagBigData = false; + private $flagCombinedWhitelist = false; private $outputFormat = ''; private $whitelist = array(); private $ignore = array(); @@ -55,8 +55,8 @@ class MalwareScanner private $patterns_re = array(); private $patterns_b64functions = array(); private $patterns_b64keywords = array(); - private $bigdata = array(); - private $bigdata_count = 0; + private $combined_whitelist = array(); + private $combined_whitelist_count = 0; /** * MalwareScanner constructor. @@ -127,8 +127,8 @@ class MalwareScanner //Check if the md5 checksum exists in the whitelist and returns true if it does. private function inWhitelist($hash) { - if ($this->flagBigData) { - if ($this->binarySearch($hash, $this->bigdata, $this->bigdata_count) > -1) { + if ($this->flagCombinedWhitelist) { + if ($this->binarySearch($hash, $this->combined_whitelist, $this->combined_whitelist_count) > -1) { return true; } } @@ -230,7 +230,7 @@ class MalwareScanner 'output-format:', 'wordpress-version:', 'scan-everything', - 'big-data' + 'combined-whitelist' ) ); @@ -310,8 +310,8 @@ class MalwareScanner if (isset($options['scan-everything']) || isset($options['E'])) { $this->setFlagScanEverything(true); } - if (isset($options['big-data'])) { - $this->setFlagBigData(true); + if (isset($options['combined-whitelist'])) { + $this->setFlagCombinedWhitelist(true); } } @@ -396,9 +396,9 @@ class MalwareScanner $this->flagScanEverything = $b; } - public function setFlagBigData($b) + public function setFlagCombinedWhitelist($b) { - $this->flagBigData = $b; + $this->flagCombinedWhitelist = $b; } // @see http://stackoverflow.com/a/13914119 @@ -578,7 +578,7 @@ class MalwareScanner * Validates the input directory * * - Calls the load pattern and load whitelist functions - * - Fetch and load big data white list + * - Fetch and load combined whitelist * - Calls the process and report functions. * * @param $dir @@ -597,7 +597,7 @@ class MalwareScanner $this->loadWhitelist(); - if ($this->flagBigData && !$this->updateBigData()) { + if ($this->flagCombinedWhitelist && !$this->updateCombinedWhitelist()) { return false; } @@ -725,7 +725,7 @@ class MalwareScanner return $key; } - private function updateBigData($url = 'https://scr34m.github.io/php-malware-scanner') + private function updateCombinedWhitelist($url = 'https://scr34m.github.io/php-malware-scanner') { $latest_hash = trim(file_get_contents($url . '/database/compressed.sha256')); if ($latest_hash === false) { @@ -733,7 +733,7 @@ class MalwareScanner return false; } - $file = __DIR__ . '/bigdata.dat'; + $file = __DIR__ . '/whitelist.dat'; if (is_readable($file)) { $hash = hash_file('sha256', $file); if ($hash != $latest_hash) { @@ -760,16 +760,16 @@ class MalwareScanner } $content = gzdecode(file_get_contents($file)); - $this->bigdata = []; - $this->bigdata_count = 0; + $this->combined_whitelist = []; + $this->combined_whitelist_count = 0; foreach (explode("\n", $content) as $line) { // faster than strtok, but needs more memory if ($line) { - $this->bigdata[] = $line; - $this->bigdata_count++; + $this->combined_whitelist[] = $line; + $this->combined_whitelist_count++; } } - $this->bigdata_count -= 1; // -1 because we use indexes in binary search - echo 'Big data loaded hash count: ' . ($this->bigdata_count + 1) . PHP_EOL; + $this->combined_whitelist_count -= 1; // -1 because we use indexes in binary search + echo 'Combined whitelist records count: ' . ($this->combined_whitelist_count + 1) . PHP_EOL; return true; } @@ -797,7 +797,7 @@ class MalwareScanner echo ' -L --line-number Display matching pattern line number in file' . PHP_EOL; echo ' -o --output-format Custom defined output format' . PHP_EOL; echo ' -j --wordpress-version Version of wordpress to get md5 signatures' . PHP_EOL; - echo ' --big-data General whitelist' . PHP_EOL; + echo ' --combined-whitelist Combined whitelist' . PHP_EOL; }