c++-gtk-utils
reassembler.h
Go to the documentation of this file.
00001 /* Copyright (C) 2005 to 2010 Chris Vine
00002 
00003 The library comprised in this file or of which this file is part is
00004 distributed by Chris Vine under the GNU Lesser General Public
00005 License as follows:
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public License
00009    as published by the Free Software Foundation; either version 2.1 of
00010    the License, or (at your option) any later version.
00011 
00012    This library is distributed in the hope that it will be useful, but
00013    WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License, version 2.1, for more details.
00016 
00017    You should have received a copy of the GNU Lesser General Public
00018    License, version 2.1, along with this library (see the file LGPL.TXT
00019    which came with this source code package in the c++-gtk-utils
00020    sub-directory); if not, write to the Free Software Foundation, Inc.,
00021    59 Temple Place - Suite 330, Boston, MA, 02111-1307, USA.
00022 
00023 */
00024 
00025 #ifndef CGU_REASSEMBLER_H
00026 #define CGU_REASSEMBLER_H
00027 
00028 #include <c++-gtk-utils/shared_handle.h>
00029 #include <c++-gtk-utils/cgu_config.h>
00030 
00031 namespace Cgu {
00032 
00033 namespace Utf8 {
00034 
00035 
00036 /**
00037  * @class Reassembler reassembler.h c++-gtk-utils/reassembler.h
00038  * @brief A class for reassembling UTF-8 strings sent over pipes and
00039  * sockets so they form complete valid UTF-8 characters.
00040  *
00041  * Utf8::Reassembler is a functor class which takes in a partially
00042  * formed UTF-8 string and returns a null terminated string comprising
00043  * such of the input string (after inserting, at the beginning, any
00044  * partially formed UTF-8 character which was at the end of the input
00045  * string passed in previous calls to the functor) as forms complete
00046  * UTF-8 characters (storing any partial character at the end for the
00047  * next call to the functor).  If the input string contains invalid
00048  * UTF-8 after adding any stored previous part character (apart from
00049  * any partially formed character at the end of the input string) then
00050  * operator() will return a null Cgu::SharedHandle<char*> object (that
00051  * is, Cgu::SharedHandle<char*>::get() will return 0).  Such input
00052  * will not be treated as invalid if it consists only of a single
00053  * partly formed UTF-8 character which could be valid if further bytes
00054  * were received and added to it.  In that case the returned
00055  * SharedHandle<char*> object will contain an allocated string of zero
00056  * length (apart from the terminating 0 character), rather than a NULL
00057  * pointer.
00058  *
00059  * This enables UTF-8 strings to be sent over pipes, sockets, etc. and
00060  * displayed in a GTK+ object at the receiving end.
00061  *
00062  * Note that for efficiency reasons the memory held in the returned
00063  * Cgu::SharedHandle<char*> object may be greater than the length of
00064  * the null-terminated string that is contained in that memory: just
00065  * let the Cgu::SharedHandle<char*> object manage the memory, and use
00066  * the contents like any other null-terminated string.
00067  *
00068  * This class is not needed if std::getline(), with its default '\\n'
00069  * delimiter, is used to read UTF-8 characters using, say,
00070  * Cgu::fdistream, because a whole '\\n' delimited line of UTF-8
00071  * characters will always be complete.
00072  *
00073  * This is an example of its use, reading from a pipe until it is
00074  * closed by the writer and putting the received text in a
00075  * GtkTextBuffer object:
00076  * @code
00077  *   using namespace Cgu;
00078  *
00079  *   GtkTextIter end;
00080  *   GtkTextBuffer* text_buffer = gtk_text_view_get_buffer(GTK_TEXT_VIEW(text_view));
00081  *   gtk_text_buffer_get_end_iter(text_buffer, &end);
00082  *
00083  *   Utf8::Reassembler reassembler;
00084  *   const int BSIZE = 1024;
00085  *   char read_buffer[BSIZE];
00086  *   ssize_t res;
00087  *   do {
00088  *     res = ::read(fd, read_buffer, BSIZE);
00089  *     if (res > 0) {
00090  *       SharedHandle<char*> utf8(reassembler(read_buffer, res));
00091  *       if (utf8.get()) {
00092  *         gtk_text_buffer_insert(text_buffer, &end,
00093  *                                utf8.get(), std::strlen(utf8));
00094  *       }
00095  *       else std::cerr << "Invalid utf8 text sent over pipe\n";
00096  *     }
00097  *   } while (res && (res != -1 || errno == EINTR));
00098  * @endcode
00099  */
00100 
00101 class Reassembler {
00102   size_t stored; // count of held-over partial-character bytes: returned by get_stored(), zeroed by reset() and by the constructor
00103   const static size_t buff_size = 6; // size in bytes of 'buffer'; NOTE(review): presumably the longest multi-byte sequence catered for - confirm
00104   char buffer[buff_size]; // NOTE(review): presumably holds the 'stored' held-over bytes between calls - confirm against the implementation
00105   char* join_buffer(const char*, size_t); // private helper, only declared here; NOTE(review): presumably joins 'buffer' with new input - confirm
00106 public:
00107 /**
00108  * Takes a byte array of wholly or partly formed UTF-8 characters to
00109  * be converted (after taking account of previous calls to the method)
00110  * to a valid string of wholly formed characters.
00111  * @param input The input array, whose length is given by size.
00112  * @param size The number of bytes in the input (not the number of
00113  * UTF-8 characters).
00114  * @return A Cgu::SharedHandle<char*> object holding a null terminated
00115  * string comprising such of the input (after inserting, at the
00116  * beginning, any partially formed UTF-8 character which was at the
00117  * end of the input passed in previous calls to the functor) as forms
00118  * complete UTF-8 characters (storing any partial character at the end
00119  * for the next call to the functor).  If the input is invalid after
00120  * such recombination, then a null Cgu::SharedHandle<char*> object is
00121  * returned (that is, Cgu::SharedHandle<char*>::get() will return 0).
00122  * Such input will not be treated as invalid if it consists only of a
00123  * single partly formed UTF-8 character which could be valid if
00124  * further bytes were received and added to it.  In that case the
00125  * returned Cgu::SharedHandle<char*> object will contain an allocated
00126  * string of zero length (apart from the terminating 0 character),
00127  * rather than a NULL pointer.
00128  * @exception std::bad_alloc The method might throw std::bad_alloc if
00129  * memory is exhausted and the system throws in that case.  It will
00130  * not throw any other exception.
00131  */
00132   Cgu::SharedHandle<char*> operator()(const char* input, size_t size);
00133 
00134 /**
00135  * Gets the number of bytes of a partially formed UTF-8 character
00136  * stored for the next call to operator()().  It will not throw.
00137  * @return The number of bytes stored (0 if nothing is being held).
00138  */
00139   size_t get_stored() const {return stored;}
00140 
00141 /**
00142  * Resets the Reassembler, by discarding any partially formed UTF-8
00143  * character from previous calls to operator()() (the stored byte
00143  * count is set back to 0).  It will not throw.
00144  */
00145   void reset() {stored = 0;}
00146 
00147 /**
00148  * Constructs an object with a stored byte count of 0.  It will not throw.
00149  */
00150   Reassembler(): stored(0) {}
00151 
00152 /* This only has effect if the --with-glib-memory-slices-compat or
00153  * --with-glib-memory-slices-no-compat option was picked */
00154   CGU_GLIB_MEMORY_SLICES_FUNCS
00155 };
00156 
00157 } // namespace Utf8
00158 
00159 } // namespace Cgu
00160 
00161 #endif