TRUE);

  /**
   * The IslandoraFedoraObject containing the datastream to test.
   *
   * @var IslandoraFedoraObject
   */
  public $object;

  /**
   * The DSID of the datastream to test.
   *
   * @var string
   */
  public $datastream;

  /**
   * The content of the datastream, as read from the object on construction.
   *
   * @var string
   */
  public $datastreamContent;

  /**
   * An associative array of messages returned from passed tests, and callers.
   *
   * Only add to this through $this->addPass(), so that the caller can be
   * appropriately determined.
   *
   * @var array
   */
  public $passes = array();

  /**
   * An associative array of messages returned from failed tests, and callers.
   *
   * Only add to this through $this->addFail(), so that the caller can be
   * appropriately determined.
   *
   * @var array
   */
  public $fails = array();

  /**
   * An array of additional required parameters.
   *
   * @var array
   */
  public $params = array();

  /**
   * Constructs a DatastreamValidator and immediately runs its assertions.
   *
   * @param IslandoraFedoraObject $object
   *   The object to grab the datastream from.
   * @param string $datastream
   *   The DSID of the datastream itself.
   * @param array $params
   *   An extra array of parameters the validator might need.
   */
  public function __construct($object, $datastream, array $params = array()) {
    $this->params = $params;
    $this->datastream = $datastream;
    $this->object = $object;
    // Cache the content once so that every assertion reads the same data.
    $this->datastreamContent = $this->object[$this->datastream]->content;
    $this->runValidators();
  }

  /**
   * Runs every assertion method defined on this validator.
   *
   * Called from the constructor. Each method visible on the instance whose
   * name begins with "assert" is invoked with no arguments; those methods are
   * expected to record their outcome in $this->passes and/or $this->fails.
   */
  public function runValidators() {
    foreach (get_class_methods($this) as $candidate) {
      if (strncmp($candidate, 'assert', 6) !== 0) {
        continue;
      }
      $this->$candidate();
    }
  }

  /**
   * Returns an array of pass messages.
   *
   * @return string[]
   *   The pass messages.
*/
  public function getPasses() {
    return $this->passes;
  }

  /**
   * Returns an array of fail messages.
   *
   * @return string[]
   *   The fail messages.
   */
  public function getFails() {
    return $this->fails;
  }

  /**
   * Records a passed assertion in $this->passes.
   *
   * Passes are stored as an associative array keyed by message, with the
   * value being the caller as reported by $this->getAssertionCall().
   *
   * @param string $message
   *   The message to use.
   */
  public function addPass($message) {
    $caller = $this->getAssertionCall();
    $this->passes[$message] = $caller;
  }

  /**
   * Records a failed assertion in $this->fails.
   *
   * Fails are stored as an associative array keyed by message, with the
   * value being the caller as reported by $this->getAssertionCall().
   *
   * @param string $message
   *   The message to use.
   */
  public function addFail($message) {
    $caller = $this->getAssertionCall();
    $this->fails[$message] = $caller;
  }

  /**
   * Trims the backtrace down to the validator's own assertion method.
   *
   * This is a manipulated version of DrupalWebTestCase::getAssertionCall().
   * We use it here so that we can pass back assertion calls from
   * DatastreamValidator assertions instead of less useful TestCase functions.
   *
   * @return array
   *   Whatever _drupal_get_last_caller() reports for the trimmed backtrace.
   */
  protected function getAssertionCall() {
    $frames = debug_backtrace();

    // Drop frames from the top of the stack until the frame at index 1 is an
    // 'assert*' function, or until no frame exists beyond index 1.
    while (strncmp($frames[1]['function'], 'assert', 6) !== 0 && isset($frames[2])) {
      array_shift($frames);
    }

    return _drupal_get_last_caller($frames);
  }

}

/**
 * Asserts that an object's given datastreams are common-type image files.
 *
 * Uses PHPGD to run the assertion check. This means that only certain kinds
 * of image files can be checked. Please check the documentation for the PHPGD
 * imagecreatefromstring() function to determine what filetypes are valid.
 */
class ImageDatastreamValidator extends DatastreamValidator {

  /**
   * Asserts the validity of an image using PHPGD.
*/
  protected function assertImageGeneration() {
    // imagecreatefromstring() returns FALSE when GD cannot decode the data.
    if (imagecreatefromstring($this->datastreamContent) !== FALSE) {
      $this->addPass("Image datastream {$this->datastream} is valid.");
    }
    else {
      $this->addFail("Image datastream {$this->datastream} is either invalid or corrupt.");
    }
  }

}

/**
 * Asserts the validity of any .tif/.tiff datastream.
 */
class TIFFDatastreamValidator extends DatastreamValidator {

  /**
   * Asserts that the TIFF contains an appropriate header.
   *
   * Records a pass for either of the two recognised 4-byte TIFF signatures
   * and a fail for anything else.
   */
  public function assertTIFFHeaderHex() {
    $datastream_header_hex = $this->getTIFFHeaderHex();
    if ($datastream_header_hex === '49492a00') {
      // In this case, the ingested TIFF is designated as using the "Intel
      // byte-order" (i.e. little-endian) by starting with the characters "II"
      // (repeated so that byte order does not yet need to be significant).
      // The number that follows is '42' in little-endian hex, a number of
      // 'deep philosophical significance' to the TIFF format creators.
      $this->addPass("{$this->datastream} datastream asserts that it is a valid Intel-byte-orderded TIF/TIFF file.");
    }
    elseif ($datastream_header_hex === '4d4d002a') {
      // In this case, the ingested TIFF is designated as using the "Motorola
      // byte-order" (i.e. big-endian) by starting with the characters "MM"
      // instead. 42 follows once again, this time in big-endian hex.
      $this->addPass("{$this->datastream} datastream asserts that it is a valid Motorola-byte-ordered TIF/TIFF file.");
    }
    else {
      $this->addFail("{$this->datastream} datastream does not assert that it is a valid TIF/TIFF file.");
    }
  }

  /**
   * Grabs the first 8 hexadecimal characters of the TIFF datastream.
   *
   * Only the first four bytes are hex-encoded, rather than the entire
   * datastream, since nothing past the header is inspected.
   *
   * @return string
   *   Up to 8 lowercase hex characters representing the first 4 bytes of the
   *   datastream; shorter (possibly empty) if the datastream is shorter.
   */
  protected function getTIFFHeaderHex() {
    // The cast covers PHP versions where substr() returns FALSE on an empty
    // string.
    return bin2hex((string) substr($this->datastreamContent, 0, 4));
  }

}

/**
 * Asserts the validity of a JP2 datastream.
 */
class JP2DatastreamValidator extends DatastreamValidator {

  /**
   * Asserts the hex values at the head of the JP2 file.
*
   * JP2 files begin with an offset header at the second 32-bit integer,
   * 0x6A502020. This header is in all .jp2s, and we check for it here.
   *
   * Only bytes 4-7 are hex-encoded; the rest of the datastream is untouched.
   */
  protected function assertJP2Header() {
    // The cast covers PHP versions where substr() returns FALSE when the
    // datastream is too short to contain the header.
    $header_hex = bin2hex((string) substr($this->datastreamContent, 4, 4));
    if ($header_hex === '6a502020') {
      $this->addPass("Datastream {$this->datastream} contains the appropriate JP2 header.");
    }
    else {
      $this->addFail("Datastream {$this->datastream} does not contain the appropriate JP2 header.");
    }
  }

  /**
   * Asserts the marker at the end of the JP2 file.
   *
   * JP2 files have their codestream capped with a marker, 0xFFD9. We're just
   * checking for it here to see if the .jp2 encoder finished okay.
   *
   * Only the final two bytes are hex-encoded; the rest of the datastream is
   * untouched.
   */
  protected function assertJP2Marker() {
    $marker_hex = bin2hex((string) substr($this->datastreamContent, -2));
    if ($marker_hex === 'ffd9') {
      $this->addPass("Datastream {$this->datastream} contains the appropriate JP2 ending marker.");
    }
    else {
      $this->addFail("Datastream {$this->datastream} does not contain the appropriate JP2 ending marker. If this is the only JP2 validator that failed, it is likely that derivative generation was interrupted.");
    }
  }

}

/**
 * Asserts the validity of a PDF datastream.
 */
class PDFDatastreamValidator extends DatastreamValidator {

  /**
   * Validates the PDF signature.
   *
   * A pass is recorded when the datastream begins with '%PDF-'; the three
   * characters that follow are reported as the PDF version.
   */
  protected function assertPDFSignature() {
    if (substr($this->datastreamContent, 0, 5) === '%PDF-') {
      $pdf_version = substr($this->datastreamContent, 5, 3);
      $this->addPass("{$this->datastream} datastream asserts that it is a valid PDF file using PDF version {$pdf_version}");
    }
    else {
      $this->addFail("{$this->datastream} datastream binary header appears to be corrupt and missing a valid PDF signature.");
    }
  }

  /**
   * Counts the number of streams in this PDF file and asserts there are any.
*/
  protected function assertPDFStreamCount() {
    // Search the raw bytes for a line consisting of 'stream'. Searching a
    // hex-encoded copy could also match across byte boundaries.
    $pdf_stream_count = substr_count($this->datastreamContent, "\nstream\n");
    if ($pdf_stream_count !== 0) {
      $this->addPass("{$this->datastream} datastream reports the existence of {$pdf_stream_count} PDF streams. Note that an extremely low number could still indicate corruption.");
    }
    else {
      $this->addFail("{$this->datastream} datastream contains zero PDF streams, and is likely not a PDF file.");
    }
  }

  /**
   * Validates the PDF closing tag.
   *
   * Records a pass if a line consisting of '%%EOF', with a line feed on each
   * side, appears anywhere in the datastream, and a fail otherwise.
   */
  protected function assertPDFClosingTag() {
    // Compare against FALSE explicitly: strpos() returns 0 for a match at the
    // very start, which would otherwise be treated as "not found".
    if (strpos($this->datastreamContent, "\n%%EOF\n") !== FALSE) {
      $this->addPass("{$this->datastream} datastream reports the existence of the closing 'EOF' tag required at the end of PDFs");
    }
    else {
      $this->addFail("{$this->datastream} datastream does not contain the closing 'EOF' tag. If this is the only PDF validation that failed, it is likely that derivative generation was interrupted.");
    }
  }

}

/**
 * Validates the number of times a string occurs in a datastream.
 *
 * Requires $this->params to be set to an array containing two keys - the first
 * is the string we're looking to find in the datastream, and the second is an
 * integer representing the number of times it should appear in the datastream.
 */
class TextDatastreamValidator extends DatastreamValidator {

  /**
   * Asserts that the string given appears the correct number of times.
   *
   * Records a fail without counting when either of the two required params
   * is missing. The count is compared strictly, so the expected value in
   * $this->params[1] must be an integer.
   */
  protected function assertTextStringCount() {
    // Both the needle (key 0) and the expected count (key 1) are required.
    if (!isset($this->params[0], $this->params[1])) {
      $this->addFail("TextDatastreamValidator cannot be instantiated without two keys in the 'params' variable.");
      return;
    }

    $string_count = $this->getTextStringCount();
    $expected = $this->params[1];
    $message = "{$this->datastream} datastream contains the word(s) '{$this->params[0]}' repeated {$string_count} time(s) (expected: {$expected}).";
    if ($string_count === $expected) {
      $this->addPass($message);
    }
    else {
      $this->addFail($message);
    }
  }

  /**
   * The number of times key [0] in $this->params appears in the datastream.
   *
   * @return int
   *   The number of non-overlapping occurrences found by substr_count().
   */
  protected function getTextStringCount() {
    return substr_count($this->datastreamContent, $this->params[0]);
  }

}

/**
 * Asserts the validity of a WAV datastream.
 *
 * WAV files contain a rigidly detailed header that contains all sorts of fun
 * information we can use to validate things against other things. So, we check
 * rigorously that the header contains properly constructed data by looking to
 * see if certain values are at their expected byte offset. We also compare
 * declared chunk sizes against actual sizes. If any of these are off, WAV
 * players will fail to function.
 */
class WAVDatastreamValidator extends DatastreamValidator {

  /**
   * We need a special constructor here to get the hex datastream content.
   *
   * The content is hex-encoded before the assertions run, so every offset
   * used by this class is counted in hex characters (two per byte).
   *
   * @param IslandoraFedoraObject $object
   *   The object to grab the datastream from.
   * @param string $datastream
   *   The DSID of the datastream itself.
   * @param array $params
   *   An extra array of parameters the validator might need.
   */
  public function __construct($object, $datastream, array $params = array()) {
    $this->object = $object;
    $this->datastream = $datastream;
    $this->params = $params;
    $this->datastreamContent = bin2hex($object[$datastream]->content);
    $this->runValidators();
  }

  /**
   * Asserts that the datastream contains a valid WAV signature.
*/
  protected function assertWAVSignature() {
    // $this->datastreamContent is hex-encoded, so offsets are in hex
    // characters: 'RIFF' (52494646) occupies the first 8, and 'WAVE'
    // (57415645) the 8 starting at offset 16.
    $riff_marker = substr($this->datastreamContent, 0, 8);
    $wave_marker = substr($this->datastreamContent, 16, 8);
    // These must be comparisons. Assignments here would always evaluate to
    // TRUE and let any datastream pass.
    if ($riff_marker === '52494646' && $wave_marker === '57415645') {
      $this->addPass("Header of the {$this->datastream} datastream contains a valid file signature.");
    }
    else {
      $this->addFail("Header of the {$this->datastream} datastream contains corrupt file signature.");
    }
  }

  /**
   * Asserts that the chunksize in the header is correct.
   *
   * The value stored at hex offset 8 must equal 36 plus the size reported
   * for the 'data' subchunk.
   */
  protected function assertWAVChunkSize() {
    $declared_chunk_size = islandora_hex2int(substr($this->datastreamContent, 8, 8));
    if ($declared_chunk_size === 36 + $this->getDataSubChunkSize()) {
      $this->addPass("{$this->datastream} datastream chunksize in WAV header is correct");
    }
    else {
      $this->addFail("{$this->datastream} datastream chunksize in WAV header does not match actual chunksize.");
    }
  }

  /**
   * Asserts that the datastream contains a 'fmt' subchunk.
   */
  protected function assertWAVFmtSubChunk() {
    // 666d7420 is 'fmt ' (with trailing space) in hex.
    if (substr($this->datastreamContent, 24, 8) === '666d7420') {
      $this->addPass("{$this->datastream} datastream contains a 'fmt' subchunk.");
    }
    else {
      $this->addFail("{$this->datastream} datastream is missing the required 'fmt' subchunk.");
    }
  }

  /**
   * Asserts that the byterate reported by the WAV header is valid.
   *
   * The declared byterate (hex offset 56) must equal the sample rate (hex
   * offset 48) multiplied by the channel count and the bytes per sample.
   */
  protected function assertWAVByteRate() {
    $wav_samplerate = islandora_hex2int(substr($this->datastreamContent, 48, 8));
    $declared_byterate = islandora_hex2int(substr($this->datastreamContent, 56, 8));
    if ($declared_byterate === $wav_samplerate * $this->getNumChannels() * $this->getBytesPerSample()) {
      $this->addPass("{$this->datastream} datastream byterate in the WAV header is correct.");
    }
    else {
      $this->addFail("{$this->datastream} datastream byterate in the WAV header does not match actual calculated byterate.");
    }
  }

  /**
   * Asserts that the block alignment is correct.
*/
  protected function assertWAVBlockAlignment() {
    $declared_alignment = islandora_hex2int(substr($this->datastreamContent, 64, 4));
    if ($declared_alignment === $this->getNumChannels() * $this->getBytesPerSample()) {
      $this->addPass("{$this->datastream} datastream block alignment is set correctly.");
    }
    else {
      $this->addFail("{$this->datastream} datastream block alignment is off.");
    }
  }

  /**
   * Asserts the existence of a 'data' subchunk.
   *
   * Also asserts that the subchunk size is correct. A header that reports
   * zero channels or zero bytes per sample is recorded as an incorrectly
   * sized chunk rather than being allowed to trigger a division by zero.
   */
  protected function assertWAVDataSubChunk() {
    // 64617461 is 'data' in hex.
    if (substr($this->datastreamContent, 72, 8) !== '64617461') {
      $this->addFail("{$this->datastream} datastream is missing the 'data' subchunk.");
      return;
    }
    $this->addPass("{$this->datastream} datastream contains 'data' subchunk.");

    $num_channels = $this->getNumChannels();
    $bytes_per_sample = $this->getBytesPerSample();
    // A corrupt header can report zero for either value; the sample count
    // cannot be derived in that case, so the size cannot be confirmed.
    if ($num_channels == 0 || $bytes_per_sample == 0) {
      $this->addFail("{$this->datastream} datastream 'data' chunk is sized incorrectly.");
      return;
    }

    // Everything from hex offset 88 onwards is treated as sample data; the
    // final division by 2 converts hex characters back to bytes.
    $wav_numsamples = strlen(substr($this->datastreamContent, 88)) / $num_channels / $bytes_per_sample / 2;
    if ($this->getDataSubChunkSize() === $wav_numsamples * $num_channels * $bytes_per_sample) {
      $this->addPass("{$this->datastream} datastream 'data' chunk is the correct size.");
    }
    else {
      $this->addFail("{$this->datastream} datastream 'data' chunk is sized incorrectly.");
    }
  }

  /**
   * Gets the number of channels reported by the WAV header.
   *
   * @return int
   *   The value islandora_hex2int() produces for the 4 hex characters at
   *   offset 44 of the hex-encoded datastream.
   */
  protected function getNumChannels() {
    return islandora_hex2int(substr($this->datastreamContent, 44, 4));
  }

  /**
   * Gets the number of bytes per sample reported by the WAV header.
   *
   * @return int|float
   *   The value at hex offset 68 divided by 8; this is a float when that
   *   value is not a multiple of 8.
   */
  protected function getBytesPerSample() {
    return islandora_hex2int(substr($this->datastreamContent, 68, 4)) / 8;
  }

  /**
   * Gets the size of the 'data' subchunk.
   *
   * @return int
   *   The value islandora_hex2int() produces for the 8 hex characters at
   *   offset 80 of the hex-encoded datastream.
   */
  protected function getDataSubChunkSize() {
    return islandora_hex2int(substr($this->datastreamContent, 80, 8));
  }

}

/**
 * Asserts the validity of any .mp3 datastream.
*
 * Our default setup tries to create an MP3 using VBR, but we do some extra
 * checks in case someone turns that off. If the header contains the characters
 * 'Xing', it is flagged as VBR, and we can do an in-depth check on each of the
 * VBR settings. Otherwise, we look for the basic MP3 signature 'fffa' or 'fffb'
 * at the start of the binary.
 */
class MP3DatastreamValidator extends DatastreamValidator {

  /**
   * Asserts the validity of the MP3.
   *
   * The MP3 file format is a bit of a mess; the entire makeup of the file
   * depends on whether it uses variable bit rate or static bit rate. So, I'm
   * breaking my own rules here and using a single assert function so that I
   * can handle the weird logic.
   *
   * Note that this replaces $this->datastreamContent with its hex encoding.
   */
  protected function assertValidMP3() {
    $this->datastreamContent = bin2hex($this->datastreamContent);

    // Locate the VBR flag ('Xing', 58696e67) once. A match at offset 0 is
    // deliberately treated the same as no match, so a datastream that starts
    // with the tag is not validated as VBR.
    $xing_position = strpos($this->datastreamContent, '58696e67');
    $has_vbr_header = $xing_position !== FALSE && $xing_position > 0;

    // If it's not a VBR MP3, we don't have to check much, so let's get that
    // out of the way first before we go doing a bunch of potentially pointless
    // math.
    if (!$has_vbr_header) {
      $mp3_signature = substr($this->datastreamContent, 0, 4);
      if ($mp3_signature === 'fffa') {
        $this->addPass("{$this->datastream} datastream is encoded as a valid MPEG-1 Layer 3 file with CRC protection");
        return;
      }
      if ($mp3_signature === 'fffb') {
        $this->addPass("{$this->datastream} datastream is encoded as a valid unprotected MPEG-1 Layer 3 file");
        return;
      }
      // If none of that works out, fail.
      $this->addFail("{$this->datastream} datastream is corrupt and does not identify as a valid MP3.");
      return;
    }

    // Check the field flags. VBR-formatted MP3 files contain a 32-bit integer
    // (stored as $mp3_flag_value) that is a combination of four bits, each one
    // indicating the on-off status of a VBR setting, via logical OR. Each
    // field that is present pushes the fields after it further along, so we
    // track how far to offset the ones we read. All offsets are in hex
    // characters.
    $mp3_field_offset = array(0, 0, 0);
    $mp3_vbrheader = substr($this->datastreamContent, $xing_position, 240);
    $mp3_flag_value = hexdec(substr($mp3_vbrheader, 8, 8));

    // We can't use the first flag, but we still need to offset the rest.
    if ($mp3_flag_value & 0x1) {
      $mp3_field_offset[0] += 8;
      $mp3_field_offset[1] += 8;
      $mp3_field_offset[2] += 8;
    }

    // The second flag leads us to filesize data, which we can verify.
    if ($mp3_flag_value & 0x2) {
      $mp3_field_bytes = hexdec(substr($mp3_vbrheader, $mp3_field_offset[0] + 16, 8));
      $mp3_size = strlen($this->datastreamContent) / 2;
      if ($mp3_size == $mp3_field_bytes) {
        $this->addPass("{$this->datastream} datastream reported filesize of {$mp3_size} bytes matches size field value of {$mp3_field_bytes}");
      }
      else {
        $this->addFail("{$this->datastream} datastream reported filesize of {$mp3_size} bytes does not match size field value of {$mp3_field_bytes}");
      }
      $mp3_field_offset[1] += 8;
      $mp3_field_offset[2] += 8;
    }

    // We can't use the third flag for anything, but we still have to offset.
    if ($mp3_flag_value & 0x4) {
      $mp3_field_offset[2] += 200;
    }

    // The fourth flag leads us to VBR quality data, which we can validate.
    // Any flag value above 7 is treated as having this field.
    if ($mp3_flag_value > 7) {
      $mp3_field_quality = hexdec(substr($mp3_vbrheader, $mp3_field_offset[2] + 16, 8));
      if ($mp3_field_quality <= 100 && $mp3_field_quality >= 0) {
        $this->addPass("{$this->datastream} datastream reports valid VBR quality of {$mp3_field_quality} (expected: between 0-100)");
      }
      else {
        $this->addFail("{$this->datastream} datastream reports invalid VBR quality of {$mp3_field_quality} (expected: between 0-100)");
      }
    }
  }

}

/**
 * Attempts to validate an .mp4 datastream.
 *
 * MP4 files are a subset of the ISO file format specification, and as such need
 * to contain a 64-bit declaration of type within the first eight bytes of
 * the file. This declaration is comprised of the characters 'ftyp', followed by
 * a four-character filetype code. Here, we look for 'ftyp', and then pass the
 * filetype code to the test message.
 */
class MP4DatastreamValidator extends DatastreamValidator {

  /**
   * Asserts that the datastream is ISO-formatted video.
   *
   * Records a pass, naming the four-character code that follows 'ftyp', when
   * 'ftyp' is found anywhere after the first byte of the datastream.
   */
  protected function assertISOVideo() {
    $ftyp_position = strpos($this->datastreamContent, 'ftyp');
    // A match at offset 0 is treated as not found, since nothing would
    // precede the 'ftyp' marker.
    if ($ftyp_position !== FALSE && $ftyp_position > 0) {
      // Read the four characters that follow the marker from the content
      // itself, not from the numeric position.
      $mp4_ftyp = substr($this->datastreamContent, $ftyp_position + 4, 4);
      $this->addPass("{$this->datastream} datastream asserts that it is a valid ISO-formatted video file using ftyp {$mp4_ftyp}");
    }
    else {
      $this->addFail("{$this->datastream} datastream is not a valid ISO-formatted video");
    }
  }

}

/**
 * Attempts to validate an .ogg/ogv datastream using Vorbis and Theora encoding.
 *
 * OGG files are made up of several 'pages' of OGG data, each prefaced with an
 * OGG marker - the letters 'OggS'. The file header also contains information on
 * what encoders were used to create the file. Here, we're looking for at least
 * one OGG page, and confirming that the file asserts the Theora and Vorbis
 * codecs were used to create the file.
 */
class OGGDatastreamValidator extends DatastreamValidator {

  /**
   * Asserts that the datastream contains ogg pages.
   */
  protected function assertOGGPages() {
    $ogg_pages = substr_count($this->datastreamContent, 'OggS');
    if ($ogg_pages !== 0) {
      $this->addPass("{$this->datastream} datastream asserts that it contains {$ogg_pages} Ogg pages (even a very small file should contain several).");
    }
    else {
      $this->addFail("{$this->datastream} datastream contains no Ogg pages.");
    }
  }

  /**
   * Asserts that the datastream contains Theora-encoded video.
*/
  protected function assertTheoraVideo() {
    $has_theora_marker = strpos($this->datastreamContent, 'theora') !== FALSE;
    if (!$has_theora_marker) {
      $this->addFail("{$this->datastream} datastream contains no marker indicating the presence of Theora-encoded video data.");
      return;
    }
    $this->addPass("{$this->datastream} datastream asserts that it contains Theora-encoded video data.");
  }

  /**
   * Asserts that the datastream contains Vorbis-encoded audio.
   *
   * Passes when the string 'vorbis' occurs at least once in the datastream.
   */
  protected function assertVorbisAudio() {
    $vorbis_markers = substr_count($this->datastreamContent, 'vorbis');
    if ($vorbis_markers > 0) {
      $this->addPass("{$this->datastream} datastream asserts that it contains Vorbis-encoded audio data");
      return;
    }
    $this->addFail("{$this->datastream} datastream contains no marker indicating the presence of Vorbis-encoded audio data.");
  }

}

/**
 * Attempts to validate an .mkv datastream.
 *
 * There's not much we can do to check an MKV file, since the format is really,
 * really loose. We do know a couple of things though - first, since MKV is an
 * EBML format, the first four characters will always be the same. Since they're
 * non-standard characters, we're looking at their hex values instead. And
 * second, we know that the file will contain the declaration 'matroska' soon
 * after.
 */
class MKVDatastreamValidator extends DatastreamValidator {

  /**
   * Asserts that the datastream is an EBML-format file.
   *
   * Passes when the hex encoding of the datastream begins with '1a45dfa3'.
   */
  protected function assertEBMLFormat() {
    $content_hex = bin2hex($this->datastreamContent);
    if (strncmp($content_hex, '1a45dfa3', 8) !== 0) {
      $this->addFail("{$this->datastream} datastream is not an EBML-formatted file.");
      return;
    }
    $this->addPass("{$this->datastream} datastream asserts that it is an EBML-formatted file");
  }

  /**
   * Asserts that the datastream contains a matroska marker.
   *
   * Passes only when the string 'matroska' occurs exactly once.
   */
  protected function assertMatroskaMarker() {
    $marker_count = substr_count($this->datastreamContent, 'matroska');
    if ($marker_count !== 1) {
      $this->addFail("{$this->datastream} datastream does not contain a Matroska EBML DocType marker.");
      return;
    }
    $this->addPass("{$this->datastream} datastream asserts that its EBML DocType is Matroska");
  }

}