From 5cf90cd3718ae9e56f373ac8dec18260ed609c12 Mon Sep 17 00:00:00 2001
From: Gabor Gyorvari
Date: Mon, 10 Dec 2018 13:02:03 +0100
Subject: [PATCH] Working with pre generated big hash database for whitelisting

---
Review notes (text in this area is not part of the change itself):
 - inWhitelist(): binarySearch() reports a miss as false, so its result is now
   tested with !== false instead of > -1.
 - updateBigData(): the checksum download is checked before trim() is applied;
   a failed write, a hash mismatch after download and an unreadable or
   undecodable database now abort with an error instead of loading whatever
   is on disk; digests are compared with !==.
 - Hunk sizes and the diffstat for scan.php were recalculated for the lines
   below; the blob ids on the index lines are still those of the original
   submission.
 - Line structure and indentation were restored by convention. Runs of spaces
   inside lines (for example in the help text) could not be recovered from
   this copy and are reproduced as found.
 - tools/bigdata/generate.php: this copy of the hunk has lost its beginning
   (the header declares 41 added lines, 26 are present, and the first one
   starts in the middle of a statement). The remaining lines are reproduced
   untouched.

 scan.php                   | 137 ++++++++++++++++++++++++++++++++++---
 tools/bigdata/generate.php |  41 +++++++++++++
 2 files changed, 171 insertions(+), 7 deletions(-)
 create mode 100644 tools/bigdata/generate.php

diff --git a/scan.php b/scan.php
index 14e2144..79b27e0 100644
--- a/scan.php
+++ b/scan.php
@@ -39,6 +39,7 @@ class MalwareScanner
     private $flagFollowSymlink = false;
     private $flagLineNumber = false;
     private $flagScanEverything = false;
+    private $flagBigData = false;
     private $outputFormat = '';
     private $whitelist = array();
     private $ignore = array();
@@ -54,6 +55,8 @@ class MalwareScanner
     private $patterns_re = array();
     private $patterns_b64functions = array();
     private $patterns_b64keywords = array();
+    private $bigdata = array();
+    private $bigdata_count = 0;
 
     /**
      * MalwareScanner constructor.
@@ -124,6 +127,11 @@ class MalwareScanner
     //Check if the md5 checksum exists in the whitelist and returns true if it does.
     private function inWhitelist($hash)
     {
+        if ($this->flagBigData) {
+            if ($this->binarySearch($hash, $this->bigdata, $this->bigdata_count) !== false) {
+                return true;
+            }
+        }
         return in_array($hash, $this->whitelist);
     }
 
@@ -221,7 +229,8 @@ class MalwareScanner
                 'line-number',
                 'output-format:',
                 'wordpress-version:',
-                'scan-everything'
+                'scan-everything',
+                'big-data'
             )
         );
 
@@ -301,6 +310,9 @@ class MalwareScanner
         if (isset($options['scan-everything']) || isset($options['E'])) {
             $this->setFlagScanEverything(true);
         }
+        if (isset($options['big-data'])) {
+            $this->setFlagBigData(true);
+        }
     }
 
     public function setExtensions(array $a)
@@ -384,6 +396,11 @@ class MalwareScanner
         $this->flagScanEverything = $b;
     }
 
+    public function setFlagBigData($b)
+    {
+        $this->flagBigData = $b;
+    }
+
     // @see http://stackoverflow.com/a/13914119
     private function pathMatches($path, $pattern, $ignoreCase = false)
     {
@@ -557,24 +574,33 @@ class MalwareScanner
         echo 'Total malware identified: ' . $this->stat['files_infected'] . PHP_EOL;
     }
 
-    //Validates the input directory
-    //Calls the load pattern and load whitelist functions
-    //Calls the process and report functions.
+    /**
+     * Validates the input directory
+     *
+     * - Calls the load pattern and load whitelist functions
+     * - Fetch and load big data white list
+     * - Calls the process and report functions.
+     *
+     * @param $dir
+     * @return bool
+     */
     public function run($dir)
     {
-        //Make sure the input is a valid directory path.
+        // Make sure the input is a valid directory path.
         $dir = rtrim($dir, '/');
         if (!is_dir($dir)) {
             $this->error('Specified path is not a directory: ' . $dir);
             return false;
         }
 
-        //Load Patterns
         $this->initializePatterns();
 
-        //Load Whitelist
         $this->loadWhitelist();
 
+        if ($this->flagBigData && !$this->updateBigData()) {
+            return false;
+        }
+
         $start = time();
         $this->process($dir . '/');
         $this->report($start, $dir . '/');
@@ -674,6 +700,102 @@ class MalwareScanner
         }
     }
 
+    /**
+     * Binary search for an exact string match in a list sorted in strcmp() order.
+     *
+     * @see https://www.mkwd.net/binary-search-algorithm-in-php/
+     *
+     * @param string $needle   Value to look for
+     * @param array  $haystack Sorted list of strings
+     * @param int    $high     Highest index to inspect (inclusive)
+     * @param int    $low      Lowest index to inspect (inclusive)
+     * @return int|false Index of the match, or false when the needle is absent
+     */
+    private function binarySearch($needle, array $haystack, $high, $low = 0)
+    {
+        $key = false;
+        // Whilst we have a range. If not, then that match was not found.
+        while ($high >= $low) {
+            // Find the middle of the range.
+            $mid = (int)floor(($high + $low) / 2);
+            // Compare the middle of the range with the needle. This should return <0 if it's in the first part of the range,
+            // or >0 if it's in the second part of the range. It will return 0 if there is a match.
+            $cmp = strcmp($needle, $haystack[$mid]);
+            // Adjust the range based on the above logic, so the next loop iteration will use the narrowed range
+            if ($cmp < 0) {
+                $high = $mid - 1;
+            } elseif ($cmp > 0) {
+                $low = $mid + 1;
+            } else {
+                $key = $mid;
+                break;
+            }
+        }
+
+        return $key;
+    }
+
+    /**
+     * Refreshes the local big data whitelist when needed and loads it into memory.
+     *
+     * The cached file is downloaded again when it is missing or when its SHA-256
+     * differs from the checksum published next to the database.
+     *
+     * @return bool False when the database cannot be fetched, verified or decoded
+     */
+    private function updateBigData()
+    {
+        $url = 'http://127.0.0.1:4000';
+        // Test for failure before trimming: trim(false) is '' and would hide a failed download.
+        $latest_hash = file_get_contents($url . '/database/compressed.sha256');
+        if ($latest_hash === false) {
+            $this->error('Unable to download database checksum');
+            return false;
+        }
+        $latest_hash = trim($latest_hash);
+
+        $file = __DIR__ . '/bigdata.dat';
+        // Strict comparison, so digests are never compared as numeric strings.
+        $download = !is_readable($file) || hash_file('sha256', $file) !== $latest_hash;
+
+        if ($download) {
+            $data = file_get_contents($url . '/database/compressed.dat');
+            if ($data === false) {
+                $this->error('Unable to download database');
+                return false;
+            }
+
+            if (file_put_contents($file, $data) === false) {
+                $this->error('Unable to write database: ' . $file);
+                return false;
+            }
+
+            // Never load a database that failed verification.
+            if (hash_file('sha256', $file) !== $latest_hash) {
+                $this->error('Downloaded database hash mismatch');
+                return false;
+            }
+        }
+
+        $raw = file_get_contents($file);
+        $content = ($raw === false) ? false : gzdecode($raw);
+        if ($content === false) {
+            $this->error('Unable to read database: ' . $file);
+            return false;
+        }
+
+        $this->bigdata = [];
+        foreach (explode("\n", $content) as $line) { // faster than strtok, but needs more memory
+            if ($line) {
+                $this->bigdata[] = $line;
+            }
+        }
+        // binarySearch() expects the highest valid index, not the element count.
+        $this->bigdata_count = count($this->bigdata) - 1;
+        echo 'Big data loaded hash count: ' . ($this->bigdata_count + 1) . PHP_EOL;
+        return true;
+    }
+
     //Prints out the usage menu options.
     private function showHelp()
     {
@@ -698,6 +820,7 @@ class MalwareScanner
         echo ' -L --line-number Display matching pattern line number in file' . PHP_EOL;
         echo ' -o --output-format Custom defined output format' . PHP_EOL;
         echo ' -j --wordpress-version Version of wordpress to get md5 signatures' . PHP_EOL;
+        echo ' --big-data General whitelist' . PHP_EOL;
     }
 
 
diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php
new file mode 100644
index 0000000..d9cd239
--- /dev/null
+++ b/tools/bigdata/generate.php
@@ -0,0 +1,41 @@
+\s+JSON.*?<\/a>/is', $data, $m);
+    foreach ($m[1] as $url) {
+        $file = str_replace(['/', '.'], '_', $url); // fix file name
+        if (!is_readable(__DIR__ . '/cache/' . $file . '.json')) {
+            echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL;
+            $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url);
+            file_put_contents(__DIR__ . '/cache/' . $file . '.json', $json);
+        } else {
+            $json = file_get_contents(__DIR__ . '/cache/' . $file . '.json');
+        }
+
+        // decode json, write hash values to one file
+        echo 'Parsing json file: ' . $file . PHP_EOL;
+        foreach (json_decode($json) as $k => $hash) {
+            fputs($fp, $hash . "\n");
+        }
+    }
+}
+
+fclose($fp);
+
+echo 'Create unique database' . PHP_EOL;
+exec('sort -u -o unique.txt all.txt');
+
+echo 'Compressing all.txt' . PHP_EOL;
+exec('gzip < unique.txt > compressed.dat'); // gzencode
\ No newline at end of file