hash.h

Go to the documentation of this file.
00001 /**
00002  *  Copyright (C) 2004-2005 Alo Sarv <madcat_@users.sourceforge.net>
00003  *
00004  *  This program is free software; you can redistribute it and/or modify
00005  *  it under the terms of the GNU General Public License as published by
00006  *  the Free Software Foundation; either version 2 of the License, or
00007  *  (at your option) any later version.
00008  *
00009  *  This program is distributed in the hope that it will be useful,
00010  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *  GNU General Public License for more details.
00013  *
00014  *  You should have received a copy of the GNU General Public License
00015  *  along with this program; if not, write to the Free Software
00016  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00017  */
00018 
00019 /** \file hash.h Interface for Hash<>, HashSet<> and their base classes. */
00020 
00021 #ifndef __HASH_H__
00022 #define __HASH_H__
00023 
00024 #include <hn/utils.h>
00025 #include <hn/log.h>
00026 #include <hn/osdep.h>
00027 #include <stdexcept>
00028 #include <iosfwd>
00029 #include <string>
00030 #include <vector>
00031 
00032 /**
00033  * @page hashes Hash System Overview
00034  *
00035  * A Hash represents what is generally known as a "checksum". There are roughly
00036  * 10 widely-used checksumming algorithms, and we will sooner or
00037  * later need to implement most of them in HydraNode core. As such, we need
00038  * some way of storing all kinds of checksums, of arbitrary lengths, while
00039  * keeping type-safety. Additionally, we want to store the checksums in
00040  * lists of checksums, for example per-range checksum sets for files.
00041  *
00042  * In order to accomplish that, the following has been implemented:
00043  *
00044  * - HashBase is abstract and provides pure virtual accessors for various Hash
00045  *   internals. This class is needed by the HashSetBase class, which has to
00046  *   return Hash objects without knowing their exact types.
00047  * - Hash<> class is a concrete implementation of a checksum wrapper. It is
00048  *   parametered with specific Hash type (the pattern is generally known as
00049  *   template Policy, however we do not exploit the full potential of it here).
00050  *   Basically what we attempted to achieve here was to have compiler generate
00051  *   specific Hash classes for each of the implemented hashes, without us
00052  *   having to write each of them by hand (or using runtime checking, which
00053  *   derivation could give us). With this design, we get a concrete special
00054  *   class for each and every different Hash we create, thus attempts to mix
00055  *   different Hash objects in containers et al will result in compile-time
00056  *   errors.
00057  * - HashSetBase is abstract and provides pure virtual accessors for various
00058  *   HashSet-related functions. The reason for this class's existence is that
00059  *   we will later need to store different hashsets in the same container, which
00060  *   breaks compile-time typesafety since HashSet<> is a template. This is also
00061  *   the reason why we needed the HashBase class - since this class doesn't
00062  *   know about the specific Hashes it contains (it's implemented by derivation),
00063  *   it can only return HashBase objects.
00064  * - HashSet<> template class is container for specific Hash objects. HashSet
00065  *   object may contain a list of chunk hashes, and master-hash (filehash).
00066  *   The hash types of those two may differ, however attempts to add
00067  *   different Hash types into container will result in compile-time errors.
00068  *
00069  */
00070 
class HashBase;
class HashSetBase;

namespace CGComm {
        /**
         * Hash types and the relevant op codes.
         *
         * The "length" noted next to each entry is the digest length in
         * bytes of the corresponding algorithm.
         */
        enum HashTypeId {
                OP_HASH         = 0xa0,      //!< Hash object
                OP_HT_ADLER     = 0xa1,      //!< length =  4
                OP_HT_CRC32     = 0xa2,      //!< length =  4
                OP_HT_ED2K      = 0xa3,      //!< length = 16
                OP_HT_MD4       = 0xa4,      //!< length = 16
                OP_HT_MD5       = 0xa5,      //!< length = 16
                OP_HT_PANAMA    = 0xa6,      //!< length = 32
                OP_HT_RIPEMD160 = 0xa7,      //!< length = 20
                OP_HT_SHA1      = 0xa8,      //!< length = 20
                OP_HT_SHA256    = 0xa9,      //!< length = 32
                OP_HT_SHA384    = 0xaa,      //!< length = 48
                OP_HT_SHA512    = 0xab,      //!< length = 64
                OP_HT_TIGER     = 0xac,      //!< length = 24
                OP_HT_UUHASH    = 0xad,      //!< length = ??
                OP_HT_UNKNOWN   = 0xff       //!< unknown/userdefined/invalid
        };

        /**
         * Load a hash from stream.
         *
         * @param i     Stream to read from
         * @return      Pointer to the loaded hash. NOTE(review): ownership of
         *              the returned object is not visible from this header -
         *              confirm against the implementation.
         */
        HashBase* loadHash(std::istream &i);

        /**
         * Load HashSet from stream.
         *
         * @param i     Stream to read from
         * @return      Pointer to the loaded hash set. NOTE(review): ownership
         *              of the returned object is not visible from this header
         *              - confirm against the implementation.
         */
        HashSetBase* loadHashSet(std::istream &i);
}
00099 
00100 //! Abstract base for hash
00101 class DLLEXPORT HashBase {
00102 public:
00103         HashBase();
00104         virtual ~HashBase();
00105         virtual uint16_t size() const = 0;
00106         virtual boost::shared_array<char> getData() const = 0;
00107         virtual std::string getType() const = 0;
00108         virtual CGComm::HashTypeId getTypeId() const = 0;
00109         bool isEmpty() const { return getData() ? false : true; }
00110 
00111         virtual std::string decode() const {
00112                 if (isEmpty()) {
00113                         return std::string();
00114                 }
00115                 return Utils::decode(getData().get(), size());
00116         }
00117 
00118         //! Output operator for streams
00119         friend DLLEXPORT std::ostream& operator<<(
00120                 std::ostream &o, const HashBase &h
00121         );
00122 
00123         //! Comparison operator
00124         bool operator==(const HashBase &h) const {
00125                 if (h.getTypeId() != getTypeId()) {
00126                         return false;
00127                 }
00128                 if (!getData() && !h.getData()) { return true; }
00129                 if (
00130                         (getData() && !h.getData()) ||
00131                         (!getData() && h.getData())
00132                 ) {
00133                         return false;
00134                 }
00135                 return !memcmp(h.getData().get(), getData().get(), size());
00136         }
00137 
00138         bool operator!=(const HashBase &h) const {
00139                 return !(*this == h);
00140         }
00141 
00142         friend bool operator<(const HashBase &x, const HashBase &y) {
00143                 if (!x.getData() && !y.getData()) {
00144                         return true;
00145                 }
00146                 if (
00147                         (x.getData() && !y.getData())
00148                         || (!x.getData() && y.getData())
00149                         || x.size() != y.size()
00150                 ) {
00151                         return false;
00152                 }
00153                 return memcmp(
00154                         x.getData().get(), y.getData().get(), x.size()
00155                 ) < 0;
00156         }
00157         //! Comparison operator to bool
00158         operator bool() const { return getData(); }
00159 };
00160 
00161 //! Concrete implementation for Hash
00162 template<typename HashType>
00163 class Hash : public HashBase {
00164 public:
00165         //! Default constructor
00166         Hash() {}
00167         //! Construct and read from stream
00168         Hash(std::istream &i) {
00169                 m_data.reset(new char[HashType::size()]);
00170                 i.read(m_data.get(), HashType::size());
00171         }
00172 
00173         //! Construct from character array
00174         Hash(const char *data) {
00175                 m_data.reset(new char[HashType::size()]);
00176                 memcpy(m_data.get(), data, HashType::size());
00177         }
00178         Hash(const unsigned char *data) {
00179                 m_data.reset(new char[HashType::size()]);
00180                 memcpy(m_data.get(), data, HashType::size());
00181         }
00182         Hash(const boost::shared_array<char> &data) : m_data(data) {}
00183 
00184         //! Construct from character array and size - asserts if length !=
00185         //! HashType::size()
00186         Hash(const char *data, uint16_t length) {
00187                 assert(length == HashType::size());
00188                 m_data.reset(new char[HashType::size()]);
00189                 memcpy(m_data.get(), data, HashType::size());
00190         }
00191         Hash(const unsigned char *data, uint16_t length) {
00192                 assert(length == HashType::size());
00193                 m_data.reset(new char[HashType::size()]);
00194                 memcpy(m_data.get(), data, HashType::size());
00195         }
00196         Hash(const std::string &data) {
00197                 assert(data.size() == HashType::size());
00198                 m_data.reset(new char[HashType::size()]);
00199                 memcpy(m_data.get(), data.data(), HashType::size());
00200         }
00201         Hash(const boost::shared_array<char> *data) : m_data(data) {}
00202 
00203         //! Destructor
00204         ~Hash() {}
00205 
00206         // compiler-generated copy-constructor and assignment operators are ok
00207 
00208         //! Accessors
00209         //@{
00210         uint16_t size() const { return HashType::size(); }
00211         boost::shared_array<char> getData() const { return m_data; }
00212         std::string getType() const { return HashType::getType(); }
00213         CGComm::HashTypeId getTypeId() const { return HashType::getTypeId(); }
00214         //@}
00215 
00216         /**
00217          * Clear the contents of this hash
00218          */
00219         void clear() { m_data.reset(); }
00220 
00221         //! @name Operations
00222         //@{
00223         bool operator==(const Hash &h) const {
00224                 if (!m_data && !h.m_data) { return true; }
00225                 if ((m_data && !h.m_data) || (!m_data && h.m_data)) {
00226                         return false;
00227                 }
00228                 return !memcmp(h.m_data.get(), m_data.get(), HashType::size());
00229         }
00230         bool operator!=(const Hash &h) const {
00231                 if (!m_data && !h.m_data) { return false; }
00232                 if ((m_data && !h.m_data) || (!m_data && h.m_data)) {
00233                         return true;
00234                 }
00235                 return memcmp(h.m_data.get(), m_data.get(), HashType::size());
00236         }
00237 
00238         //! Comparison operator to bool
00239         operator bool() const { return m_data; }
00240 
00241         friend bool operator<(const Hash &x, const Hash &y) {
00242                 if (!x.m_data && !y.m_data) {
00243                         return true;
00244                 }
00245                 if ((!x.m_data && y.m_data) || (x.m_data && !y.m_data)) {
00246                         return false;
00247                 }
00248                 return memcmp(
00249                         x.m_data.get(), y.m_data.get(), HashType::size()
00250                 ) < 0;
00251         }
00252         //@}
00253 private:
00254         boost::shared_array<char> m_data;        //!< Internal data storage
00255 };
00256 
00257 //! MD4Hash specification
00258 class DLLEXPORT MD4Hash {
00259 public:
00260         static uint16_t size()                { return 16;                }
00261         static std::string getType()          { return "MD4Hash";         }
00262         static CGComm::HashTypeId getTypeId() { return CGComm::OP_HT_MD4; }
00263 };
00264 //! MD5Hash specification
00265 class DLLEXPORT MD5Hash {
00266 public:
00267         static uint16_t size()                { return 16;                }
00268         static std::string getType()          { return "MD5Hash";         }
00269         static CGComm::HashTypeId getTypeId() { return CGComm::OP_HT_MD5; }
00270 };
00271 //! ED2KHash specification
00272 class DLLEXPORT ED2KHash {
00273 public:
00274         static uint16_t size()                { return 16;                 }
00275         static std::string getType()          { return "ED2KHash";         }
00276         static CGComm::HashTypeId getTypeId() { return CGComm::OP_HT_ED2K; }
00277 };
00278 //! SHA1Hash specification
00279 class DLLEXPORT SHA1Hash {
00280 public:
00281         static uint16_t size()                { return 20;                 }
00282         static std::string getType()          { return "SHA1Hash";         }
00283         static CGComm::HashTypeId getTypeId() { return CGComm::OP_HT_SHA1; }
00284 };
00285 
namespace CGComm {
        //! Op codes of the tags that make up a serialized hash set
        enum HashSetTypeIds {
                //! Hashset
                OP_HASHSET     = 0xC0,
                //! <hash>filehash
                OP_HS_FILEHASH = 0xC1,
                //! <hash>chunkhash
                OP_HS_PARTHASH = 0xC2,
                //! <uint32_t>chunksize
                OP_HS_PARTSIZE = 0xC3
        };
}
00294 
00295 /**
00296  * Abstract base class representing a Hash Set. Provides pure virtual functions
00297  * which derived classes must override to create concrete HashSet types.
00298  */
00299 class DLLEXPORT HashSetBase {
00300 public:
00301         HashSetBase() {}
00302         virtual ~HashSetBase() {}
00303 
00304         /**
00305          * @name Pure virtual accessors
00306          */
00307         //@{
00308         virtual const HashBase&    getFileHash()              const = 0;
00309         virtual uint32_t           getChunkCnt()              const = 0;
00310         virtual const HashBase&    getChunkHash(uint32_t num) const = 0;
00311         virtual uint32_t           getChunkSize()             const = 0;
00312         virtual std::string        getFileHashType()          const = 0;
00313         virtual CGComm::HashTypeId getFileHashTypeId()        const = 0;
00314         virtual std::string        getChunkHashType()         const = 0;
00315         virtual CGComm::HashTypeId getChunkHashTypeId()       const = 0;
00316         //@}
00317 
00318         // Compares two hashets. Returns true if they are equal, false otherwise
00319         bool compare(const HashSetBase &ref) const;
00320 
00321         //! Output operator for streams
00322         friend std::ostream& operator<<(std::ostream &o, const HashSetBase &h);
00323 
00324         //! Inequality operator
00325         bool operator!=(const HashSetBase &ref) const {
00326                 return !compare(ref);
00327         }
00328 
00329         //! Equality operator
00330         bool operator==(const HashSetBase &ref) const {
00331                 return compare(ref);
00332         }
00333         const HashBase& operator[](uint32_t c) const { return getChunkHash(c); }
00334 };
00335 
00336 /**
00337  * Implements concrete HashSet class.
00338  *
00339  * @param HashType      Type of hashes to store in
00340  * @param ChunkSize     Optionally set chunk size
00341  * @param FileHashType  Optionally set file hash type
00342  */
00343 template<class HashType, class FileHashType = HashType, uint32_t ChunkSize = 0>
00344 class HashSet : public HashSetBase {
00345 public:
00346         //! Default constructor
00347         HashSet() : m_chunkSize(ChunkSize) {}
00348 
00349         //! Initialize with custom chunk size
00350         HashSet(uint32_t chunkSize) : m_chunkSize(chunkSize) {}
00351 
00352         //! Construct with existing file hash
00353         HashSet(Hash<FileHashType> h) : m_fileHash(h), m_chunkSize(ChunkSize) {}
00354 
00355         //! Construct and read from stream
00356         HashSet(std::istream &i) : m_chunkSize(ChunkSize) {
00357                 uint16_t tagcount = Utils::getVal<uint16_t>(i);
00358                 while (tagcount--) {
00359                         uint8_t opcode = Utils::getVal<uint8_t>(i);
00360                         uint16_t len = Utils::getVal<uint16_t>(i);
00361                         switch (opcode) {
00362                                 case CGComm::OP_HS_FILEHASH: {
00363                                         if (!m_fileHash.isEmpty()) {
00364                                                 logError(boost::format(
00365                                                         "Multiple filehash "
00366                                                         "tags in HashSet!"
00367                                                 ));
00368                                                 break;
00369                                         }
00370                                         uint8_t typ = Utils::getVal<uint8_t>(i);
00371                                         if (typ != CGComm::OP_HASH) {
00372                                                 logError(boost::format(
00373                                                         "Unexpected symbol %s "
00374                                                         "found at offset %s "
00375                                                         "while loading HashSet."
00376                                                 ) % Utils::hexDump(typ)
00377                                                 % Utils::hexDump(-1+i.tellg()));
00378                                                 break;
00379                                         }
00380                                         Utils::getVal<uint16_t>(i); // length
00381                                         uint8_t id = Utils::getVal<uint8_t>(i);
00382                                         if (id != FileHashType::getTypeId()) {
00383                                                 logError(boost::format(
00384                                                         "Incorrect FileHashType"
00385                                                         " %s found at offset %s"
00386                                                         " in stream.")
00387                                                         % Utils::hexDump(id)
00388                                                         % Utils::hexDump(
00389                                                                 -1+i.tellg()
00390                                                         )
00391                                                 );
00392                                                 break;
00393                                         }
00394                                         m_fileHash = Hash<FileHashType>(i);
00395                                         break;
00396                                 }
00397                                 case CGComm::OP_HS_PARTHASH: {
00398                                         uint8_t typ = Utils::getVal<uint8_t>(i);
00399                                         if (typ != CGComm::OP_HASH) {
00400                                                 logError(boost::format(
00401                                                         "Unexpected symbol %s "
00402                                                         "found at offset %s "
00403                                                         "while loading HashSet."
00404                                                 ) % Utils::hexDump(typ)
00405                                                 % Utils::hexDump(-1+i.tellg()));
00406                                                 break;
00407                                         }
00408                                         Utils::getVal<uint16_t>(i); // length
00409                                         uint8_t id = Utils::getVal<uint8_t>(i);
00410                                         if (id != HashType::getTypeId()) {
00411                                                 logError(boost::format(
00412                                                         "Incorrect "
00413                                                         "ChunkHashType"
00414                                                         " %s found in stream.")
00415                                                         % Utils::hexDump(id)
00416                                                 );
00417                                                 break;
00418                                         }
00419                                         m_hashset.push_back(Hash<HashType>(i));
00420                                         break;
00421                                 }
00422                                 case CGComm::OP_HS_PARTSIZE:
00423                                         m_chunkSize = Utils::getVal<uint32_t>(i);
00424                                         break;
00425                                 default:
00426                                         logError(boost::format(
00427                                                 "Unexpected tag %s found at "
00428                                                 "offset %s while parsing "
00429                                                 "HashSet."
00430                                         ) % Utils::hexDump(opcode)
00431                                         % Utils::hexDump(-1+i.tellg()));
00432                                         i.seekg(len, std::ios::cur);
00433                                         break;
00434                         }
00435                 }
00436         }
00437 
00438         //! Destructor
00439         ~HashSet() {}
00440 
00441         /**
00442          * @name Getters
00443          */
00444         //@{
00445 
00446         /**
00447          * Get the file hash.
00448          *
00449          * @return        File Hash - may be empty.
00450          */
00451         const HashBase& getFileHash() const {
00452                 return m_fileHash;
00453         }
00454 
00455         /**
00456          * Number of chunk hashes
00457          *
00458          * @return       Number of chunk hashes
00459          */
00460         uint32_t getChunkCnt() const { return m_hashset.size(); }
00461 
00462         /**
00463          * Retrieve Nth chunk hash
00464          *
00465          * @param num     Which chunk hash to retrieve.
00466          * @return        The requested chunk hash.
00467          *
00468          * \throws std::runtime_error if num > m_hashset.size()
00469          */
00470         const HashBase& getChunkHash(uint32_t num) const {
00471                 return m_hashset.at(num);
00472         }
00473 
00474         /**
00475          * Retrieve the size of one chunk
00476          *
00477          * @return      Chunk size
00478          */
00479         uint32_t getChunkSize() const { return m_chunkSize; }
00480 
00481         /**
00482          * RTTI - return string identifying the file hash type
00483          *
00484          * @return       String representation of file hash type
00485          */
00486         std::string getFileHashType() const { return FileHashType::getType(); }
00487 
00488         /**
00489          * RTTI - return TypeId of file hash
00490          *
00491          * @return       Enumeration value indicating file hash type
00492          */
00493         CGComm::HashTypeId getFileHashTypeId() const {
00494                 return FileHashType::getTypeId();
00495         }
00496 
00497         /**
00498          * RTTI - return string identifyin the chunk hash type
00499          *
00500          * @return       String representation of chunk hash type
00501          */
00502         std::string getChunkHashType() const { return HashType::getType(); }
00503 
00504         /**
00505          * RTTI - return TypeId of chunk hash
00506          *
00507          * @return       Enumeration value indicating chunk hash type
00508          */
00509         CGComm::HashTypeId getChunkHashTypeId() const {
00510                 return HashType::getTypeId();
00511         }
00512         //@}       // End Getters
00513 
00514         /**
00515          * @name Setters
00516          */
00517         //@{
00518 
00519         /**
00520          * Add a chunk hash
00521          *
00522          * @param h       Chunk hash to add
00523          */
00524         void addChunkHash(Hash<HashType> h) {
00525                 m_hashset.push_back(h);
00526         }
00527 
00528         /**
00529          * Set the file hash, overwriting existing (if any)
00530          *
00531          * @param h      New file hash
00532          */
00533         void setFileHash(Hash<FileHashType> h) {
00534                 m_fileHash = h;
00535         }
00536         //@}       // End Setters
00537 
00538 private:
00539         //! Contains all chunk/chunk/piece hashes. May be empty.
00540         std::vector< Hash<HashType> > m_hashset;
00541 
00542         //! Contains file hash. May be empty.
00543         Hash<FileHashType> m_fileHash;
00544 
00545         //! Size of one chunk/chunk/piece
00546         uint32_t m_chunkSize;
00547 };
00548 
00549 /**
00550  * Commonly used data types
00551  */
00552 //@{
enum Hash_Constants {
        /**
         * Size of a single file chunk as used in ED2K network. One chunk-hash
         * corresponds to each ED2K_PARTSIZE amount of file data. Only full
         * chunks of this size may be shared.
         *
         * Evaluates to 9728000.
         */
        ED2K_PARTSIZE = 9500 * 1024
};
00561 typedef HashSet<MD4Hash, ED2KHash, ED2K_PARTSIZE> ED2KHashSet;
00562 //@}
00563 
00564 #endif