diff --git a/scan.php b/scan.php
index bc82b0d..35f4bd3 100644
--- a/scan.php
+++ b/scan.php
@@ -39,6 +39,7 @@ class MalwareScanner
private $flagFollowSymlink = false;
private $flagLineNumber = false;
private $flagScanEverything = false;
+ private $flagCombinedWhitelist = false;
private $outputFormat = '';
private $whitelist = array();
private $ignore = array();
@@ -54,6 +55,8 @@ class MalwareScanner
private $patterns_re = array();
private $patterns_b64functions = array();
private $patterns_b64keywords = array();
+ private $combined_whitelist = array();
+ private $combined_whitelist_count = 0;
/**
* MalwareScanner constructor.
@@ -124,6 +127,11 @@ class MalwareScanner
//Check if the md5 checksum exists in the whitelist and returns true if it does.
private function inWhitelist($hash)
{
+        // The combined whitelist is consulted first; binarySearch() yields false on a miss
+        // and an integer index (0 included) on a hit, so test for false explicitly.
+        if ($this->flagCombinedWhitelist
+            && $this->binarySearch($hash, $this->combined_whitelist, $this->combined_whitelist_count) !== false
+        ) {
+            return true;
+        }
return in_array($hash, $this->whitelist);
}
@@ -221,7 +229,8 @@ class MalwareScanner
'line-number',
'output-format:',
'wordpress-version:',
- 'scan-everything'
+ 'scan-everything',
+ 'combined-whitelist'
)
);
@@ -301,6 +310,9 @@ class MalwareScanner
if (isset($options['scan-everything']) || isset($options['E'])) {
$this->setFlagScanEverything(true);
}
+ if (isset($options['combined-whitelist'])) {
+ $this->setFlagCombinedWhitelist(true);
+ }
}
public function setExtensions(array $a)
@@ -384,6 +396,11 @@ class MalwareScanner
$this->flagScanEverything = $b;
}
+    /**
+     * Turns hash lookups against the combined whitelist on or off.
+     *
+     * @param bool $b True to enable the combined whitelist
+     */
+    public function setFlagCombinedWhitelist($b)
+    {
+        $this->flagCombinedWhitelist = $b;
+    }
+
// @see http://stackoverflow.com/a/13914119
private function pathMatches($path, $pattern, $ignoreCase = false)
{
@@ -558,24 +575,33 @@ class MalwareScanner
echo 'Total malware identified: ' . $this->stat['files_infected'] . PHP_EOL;
}
- //Validates the input directory
- //Calls the load pattern and load whitelist functions
- //Calls the process and report functions.
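+ /**
+ * Validates the input directory, then scans it and prints the report.
+ *
+ * - Loads the patterns and the whitelist
+ * - Fetches and loads the combined whitelist when that flag is enabled
+ * - Calls the process and report functions
+ *
+ * @param string $dir Path of the directory to scan
+ * @return bool False when $dir is not a directory or the combined whitelist cannot be updated
+ */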
public function run($dir)
{
- //Make sure the input is a valid directory path.
+ // Make sure the input is a valid directory path.
$dir = rtrim($dir, '/');
if (!is_dir($dir)) {
$this->error('Specified path is not a directory: ' . $dir);
return false;
}
- //Load Patterns
$this->initializePatterns();
- //Load Whitelist
$this->loadWhitelist();
+ if ($this->flagCombinedWhitelist && !$this->updateCombinedWhitelist()) {
+ return false;
+ }
+
$start = time();
$this->process($dir . '/');
$this->report($start, $dir . '/');
@@ -671,6 +697,79 @@ class MalwareScanner
}
}
+    /**
+     * Binary search for an exact string match in a list sorted in strcmp() order.
+     *
+     * @see https://www.mkwd.net/binary-search-algorithm-in-php/
+     * @param string $needle   Value to look for
+     * @param array  $haystack Sorted strings with consecutive integer keys starting at 0
+     * @param int    $high     Highest index to search (clamped to the last element)
+     * @param int    $low      Lowest index to search (clamped to 0)
+     * @return int|false Index of the match, or false when the needle is absent
+     */
+    private function binarySearch($needle, array $haystack, $high, $low = 0)
+    {
+        // Clamp the range so an empty list or an oversized $high never reads a missing offset.
+        $high = min($high, count($haystack) - 1);
+        $low = max($low, 0);
+
+        // Whilst we have a range. If not, then the match was not found.
+        while ($low <= $high) {
+            // Integer midpoint of the current range.
+            $mid = $low + (($high - $low) >> 1);
+            $cmp = strcmp($needle, $haystack[$mid]);
+            if ($cmp === 0) {
+                return $mid;
+            }
+            if ($cmp < 0) {
+                // Needle sorts before the midpoint: continue in the lower half.
+                $high = $mid - 1;
+            } else {
+                // Needle sorts after the midpoint: continue in the upper half.
+                $low = $mid + 1;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Refreshes the local combined whitelist when needed and loads it into memory.
+     *
+     * The published SHA-256 checksum is compared with the local whitelist.dat; the database is
+     * downloaded again only when the local copy is missing, unreadable or has a different checksum.
+     *
+     * @param string $url Base URL that serves database/compressed.sha256 and database/compressed.dat
+     * @return bool True when the whitelist was loaded, false after an error has been reported
+     */
+    private function updateCombinedWhitelist($url = 'https://scr34m.github.io/php-malware-scanner')
+    {
+        // Check for failure before trim(): trim(false) is '' and would hide a failed download.
+        $latest_hash = file_get_contents($url . '/database/compressed.sha256');
+        if ($latest_hash === false) {
+            $this->error('Unable to download database checksum');
+            return false;
+        }
+        $latest_hash = trim($latest_hash);
+
+        $file = __DIR__ . '/whitelist.dat';
+        // Strict comparison: loosely compared hex strings that look numeric can be treated as equal.
+        if (!is_readable($file) || hash_file('sha256', $file) !== $latest_hash) {
+            $data = file_get_contents($url . '/database/compressed.dat');
+            if ($data === false) {
+                $this->error('Unable to download database');
+                return false;
+            }
+
+            // Verify the payload before it replaces the local copy; never load unverified data.
+            if (hash('sha256', $data) !== $latest_hash) {
+                $this->error('Downloaded database hash mismatch');
+                return false;
+            }
+
+            if (file_put_contents($file, $data) === false) {
+                $this->error('Unable to write database: ' . $file);
+                return false;
+            }
+        }
+
+        $compressed = file_get_contents($file);
+        $content = ($compressed === false) ? false : gzdecode($compressed);
+        if ($content === false) {
+            $this->error('Unable to read database: ' . $file);
+            return false;
+        }
+
+        // NOTE(review): binarySearch() needs these lines in strcmp() order - confirm the generator sorts them.
+        $this->combined_whitelist = array();
+        foreach (explode("\n", $content) as $line) { // faster than strtok, but needs more memory
+            if ($line) {
+                $this->combined_whitelist[] = $line;
+            }
+        }
+        // Stored as the highest valid index (count - 1) because binarySearch() takes it as $high.
+        $this->combined_whitelist_count = count($this->combined_whitelist) - 1;
+        echo 'Combined whitelist records count: ' . ($this->combined_whitelist_count + 1) . PHP_EOL;
+        return true;
+    }
+
//Prints out the usage menu options.
private function showHelp()
{
@@ -695,6 +794,7 @@ class MalwareScanner
echo ' -L --line-number Display matching pattern line number in file' . PHP_EOL;
echo ' -o --output-format Custom defined output format' . PHP_EOL;
echo ' -j --wordpress-version Version of wordpress to get md5 signatures' . PHP_EOL;
+ echo ' --combined-whitelist Combined whitelist' . PHP_EOL;
}
diff --git a/tools/bigdata/generate.php b/tools/bigdata/generate.php
new file mode 100644
index 0000000..81386ce
--- /dev/null
+++ b/tools/bigdata/generate.php
@@ -0,0 +1,92 @@
+\s+JSON.*?<\/a>/is', $data, $m);
+ foreach ($m[1] as $url) {
+ $file = str_replace(['/', '.'], '_', $url); // fix file name
+ if (!is_cached($file . '.json')) {
+ echo 'Downloading: ' . 'https://checksums.kubik-rubik.de/' . $url . PHP_EOL;
+ $json = file_get_contents('https://checksums.kubik-rubik.de/' . $url);
+ set_cache($file . '.json', $json);
+ } else {
+ $json = get_cache($file . '.json');
+ }
+
+ // decode json, write hash values to one file
+ echo 'Parsing json file: ' . $file . PHP_EOL;
+ foreach (json_decode($json) as $k => $hash) {
+ fputs($fp, $hash . "\n");
+ }
+ }
+ }
+}
+
+function fetch_jquery($fp)
+{
+ echo 'Fetching jQuery' . PHP_EOL;
+ $data = file_get_contents('https://code.jquery.com/jquery/');
+
+ preg_match_all('/ compressed.dat'); // gzencode
+
+$hash = hash_file('sha256', 'compressed.dat');
+file_put_contents('compressed.sha256', $hash);
+echo 'SHA256 is ' . $hash . PHP_EOL;
\ No newline at end of file
diff --git a/tools/jquery.py b/tools/jquery.py
deleted file mode 100644
index 2b00bd5..0000000
--- a/tools/jquery.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env python
-
-import urllib2
-import re
-import hashlib
-
-def fetch(url):
- response = urllib2.urlopen(url)
- return response.read()
-
-def main():
- html = fetch('https://code.jquery.com/jquery/')
- regex = re.compile(r"