diff --git a/scan.php b/scan.php index bc82b0d..35f4bd3 100644 --- a/scan.php +++ b/scan.php @@ -39,6 +39,7 @@ class MalwareScanner private $flagFollowSymlink = false; private $flagLineNumber = false; private $flagScanEverything = false; + private $flagCombinedWhitelist = false; private $outputFormat = ''; private $whitelist = array(); private $ignore = array(); @@ -54,6 +55,8 @@ class MalwareScanner private $patterns_re = array(); private $patterns_b64functions = array(); private $patterns_b64keywords = array(); + private $combined_whitelist = array(); + private $combined_whitelist_count = 0; /** * MalwareScanner constructor. @@ -124,6 +127,11 @@ class MalwareScanner //Check if the md5 checksum exists in the whitelist and returns true if it does. private function inWhitelist($hash) { + if ($this->flagCombinedWhitelist) { + if ($this->binarySearch($hash, $this->combined_whitelist, $this->combined_whitelist_count) > -1) { + return true; + } + } return in_array($hash, $this->whitelist); } @@ -221,7 +229,8 @@ class MalwareScanner 'line-number', 'output-format:', 'wordpress-version:', - 'scan-everything' + 'scan-everything', + 'combined-whitelist' ) ); @@ -301,6 +310,9 @@ class MalwareScanner if (isset($options['scan-everything']) || isset($options['E'])) { $this->setFlagScanEverything(true); } + if (isset($options['combined-whitelist'])) { + $this->setFlagCombinedWhitelist(true); + } } public function setExtensions(array $a) @@ -384,6 +396,11 @@ class MalwareScanner $this->flagScanEverything = $b; } + public function setFlagCombinedWhitelist($b) + { + $this->flagCombinedWhitelist = $b; + } + // @see http://stackoverflow.com/a/13914119 private function pathMatches($path, $pattern, $ignoreCase = false) { @@ -558,24 +575,33 @@ class MalwareScanner echo 'Total malware identified: ' . $this->stat['files_infected'] . PHP_EOL; } - //Validates the input directory - //Calls the load pattern and load whitelist functions - //Calls the process and report functions. + /** + * Validates the input directory + * + * - Calls the load pattern and load whitelist functions + * - Fetch and load combined whitelist + * - Calls the process and report functions. + * + * @param $dir + * @return bool + */ public function run($dir) { - //Make sure the input is a valid directory path. + // Make sure the input is a valid directory path. $dir = rtrim($dir, '/'); if (!is_dir($dir)) { $this->error('Specified path is not a directory: ' . $dir); return false; } - //Load Patterns $this->initializePatterns(); - //Load Whitelist $this->loadWhitelist(); + if ($this->flagCombinedWhitelist && !$this->updateCombinedWhitelist()) { + return false; + } + $start = time(); $this->process($dir . '/'); $this->report($start, $dir . '/'); @@ -671,6 +697,79 @@ class MalwareScanner } } + // @see https://www.mkwd.net/binary-search-algorithm-in-php/ + private function binarySearch($needle, array $haystack, $high, $low = 0) + { + $key = false; + // Whilst we have a range. If not, then that match was not found. + while ($high >= $low) { + // Find the middle of the range. + $mid = (int)floor(($high + $low) / 2); + // Compare the middle of the range with the needle. This should return <0 if it's in the first part of the range, + // or >0 if it's in the second part of the range. It will return 0 if there is a match. + $cmp = strcmp($needle, $haystack[$mid]); + // Adjust the range based on the above logic, so the next loop iteration will use the narrowed range + if ($cmp < 0) { + $high = $mid - 1; + } elseif ($cmp > 0) { + $low = $mid + 1; + } else { + $key = $mid; + break; + } + } + + return $key; + } + + private function updateCombinedWhitelist($url = 'https://scr34m.github.io/php-malware-scanner') + { + $latest_hash = trim(file_get_contents($url . '/database/compressed.sha256')); + if ($latest_hash === false) { + $this->error('Unable to download database checksum'); + return false; + } + + $file = __DIR__ . '/whitelist.dat'; + if (is_readable($file)) { + $hash = hash_file('sha256', $file); + if ($hash != $latest_hash) { + $download = true; + } else { + $download = false; + } + } else { + $download = true; + } + + if ($download) { + $data = file_get_contents($url . '/database/compressed.dat'); + if ($data === false) { + $this->error('Unable to download database'); + return false; + } + + file_put_contents($file, $data); + $hash = hash_file('sha256', $file); + if ($hash != $latest_hash) { + $this->error('Downloaded database hash mismatch'); + } + } + + $content = gzdecode(file_get_contents($file)); + $this->combined_whitelist = []; + $this->combined_whitelist_count = 0; + foreach (explode("\n", $content) as $line) { // faster than strtok, but needs more memory + if ($line) { + $this->combined_whitelist[] = $line; + $this->combined_whitelist_count++; + } + } + $this->combined_whitelist_count -= 1; // -1 because we use indexes in binary search + echo 'Combined whitelist records count: ' . ($this->combined_whitelist_count + 1) . PHP_EOL; + return true; + } + //Prints out the usage menu options. private function showHelp() { @@ -695,6 +794,7 @@ class MalwareScanner echo ' -L --line-number Display matching pattern line number in file' . PHP_EOL; echo ' -o --output-format Custom defined output format' . PHP_EOL; echo ' -j --wordpress-version Version of wordpress to get md5 signatures' . PHP_EOL; + echo ' --combined-whitelist Combined whitelist' . PHP_EOL; } diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php new file mode 100644 index 0000000..81386ce --- /dev/null +++ b/tools/bigdata/generate.php @@ -0,0 +1,92 @@ +\s+JSON.*?<\/a>/is', $data, $m); + foreach ($m[1] as $url) { + $file = str_replace(['/', '.'], '_', $url); // fix file name + if (!is_cached($file . '.json')) { + echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL; + $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url); + set_cache($file . '.json', $json); + } else { + $json = get_cache($file . '.json'); + } + + // decode json, write hash values to one file + echo 'Parsing json file: ' . $file . PHP_EOL; + foreach (json_decode($json) as $k => $hash) { + fputs($fp, $hash . "\n"); + } + } + } +} + +function fetch_jquery($fp) +{ + echo 'Fetching jQuery' . PHP_EOL; + $data = file_get_contents('https://code.jquery.com/jquery/'); + + preg_match_all('/ compressed.dat'); // gzencode + +$hash = hash_file('sha256', 'compressed.dat'); +file_put_contents('compressed.sha256', $hash); +echo 'SHA256 is ' . $hash . PHP_EOL; \ No newline at end of file diff --git a/tools/jquery.py b/tools/jquery.py deleted file mode 100644 index 2b00bd5..0000000 --- a/tools/jquery.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python - -import urllib2 -import re -import hashlib - -def fetch(url): - response = urllib2.urlopen(url) - return response.read() - -def main(): - html = fetch('https://code.jquery.com/jquery/') - regex = re.compile(r"