add reading from gzipped xml files (xxx.xml.gz)

This commit is contained in:
TangoCash
2018-08-22 21:26:15 +02:00
committed by Thilo Graf
parent bd36134e17
commit 9f9ac9b616
2 changed files with 132 additions and 28 deletions

View File

@@ -41,10 +41,12 @@
#include <libxml/parser.h> #include <libxml/parser.h>
#include <libxml/parserInternals.h> #include <libxml/parserInternals.h>
#else /* USE_LIBXML */ #else /* USE_LIBXML */
#include "gzstream.h"
#include "xmltok.h" #include "xmltok.h"
#endif /* USE_LIBXML */ #endif /* USE_LIBXML */
#include <fcntl.h> #include <fcntl.h>
#include <stdio.h> #include <stdio.h>
#include <zlib.h>
unsigned long xmlGetNumericAttribute(const xmlNodePtr node, const char *name, const int base) unsigned long xmlGetNumericAttribute(const xmlNodePtr node, const char *name, const int base)
{ {
@@ -230,27 +232,103 @@ xmlDocPtr parseXml(const char * data,const char* /*encoding*/)
xmlDocPtr parseXmlFile(const char * filename, bool,const char* encoding) xmlDocPtr parseXmlFile(const char * filename, bool,const char* encoding)
{ {
pugi::xml_encoding enc = pugi::encoding_auto; pugi::xml_encoding enc = pugi::encoding_auto;
if(encoding==NULL){ std::string fn = filename;
std::ifstream in; igzstream inz;
in.open(filename); std::ifstream in;
if (in.is_open()) { bool zipped = (fn.substr(fn.find_last_of(".") + 1) == "gz");
std::string line;
getline(in, line); if(encoding==NULL)
for (std::string::iterator it = line.begin(); it != line.end(); ++ it) {
*it = toupper(*it); if (zipped)
if (line.find("ISO-8859-1",0)!= std::string::npos){ {
enc = pugi::encoding_latin1; inz.open(filename);
if (inz.is_open())
{
std::string line;
getline(inz, line);
for (std::string::iterator it = line.begin(); it != line.end(); ++ it)
*it = toupper(*it);
if (line.find("ISO-8859-1",0)!= std::string::npos)
{
enc = pugi::encoding_latin1;
}
inz.close();
}
}
else
{
in.open(filename);
if (in.is_open())
{
std::string line;
getline(in, line);
for (std::string::iterator it = line.begin(); it != line.end(); ++ it)
*it = toupper(*it);
if (line.find("ISO-8859-1",0)!= std::string::npos)
{
enc = pugi::encoding_latin1;
}
in.close();
} }
in.close();
} }
} }
pugi::xml_document* tree_parser = new pugi::xml_document(); pugi::xml_document* tree_parser = new pugi::xml_document();
if (!tree_parser->load_file(filename, pugi::parse_default, enc))
if (zipped)
{ {
delete tree_parser; int fd = open(filename, O_RDONLY);
return NULL;
uint32_t gzsize = 0;
lseek(fd, -4, SEEK_END);
read(fd, &gzsize, 4);
lseek(fd, 0, SEEK_SET);
gzFile xmlgz_file = gzdopen(fd,"rb");
if (xmlgz_file == NULL)
{
delete tree_parser;
return NULL;
}
gzbuffer(xmlgz_file, 64*1024);
void* buffer = pugi::get_memory_allocation_function()(gzsize);
if (!buffer)
{
gzclose(xmlgz_file);
delete tree_parser;
return NULL;
}
size_t read_size = gzread(xmlgz_file,buffer,gzsize);
if (read_size != gzsize)
{
gzclose(xmlgz_file);
delete tree_parser;
return NULL;
}
gzclose(xmlgz_file);
const pugi::xml_parse_result result = tree_parser->load_buffer_inplace_own(buffer,gzsize, pugi::parse_default, enc);
if (result.status != pugi::xml_parse_status::status_ok)
{
printf("Error: Loading %s (%d)\n", filename, result.status);
delete tree_parser;
return NULL;
}
}
else
{
if (!tree_parser->load_file(filename, pugi::parse_default, enc))
{
delete tree_parser;
return NULL;
}
} }
if (!tree_parser->root()) if (!tree_parser->root())
@@ -295,21 +373,39 @@ xmlDocPtr parseXmlFile(const char * filename, bool warning_by_nonexistence /* =
size_t done; size_t done;
size_t length; size_t length;
FILE* xml_file; FILE* xml_file;
gzFile xmlgz_file;
std::string fn = filename;
bool zipped = (fn.substr(fn.find_last_of(".") + 1) == "gz");
xml_file = fopen(filename, "r"); if (zipped)
if (xml_file == NULL)
{ {
if (warning_by_nonexistence) xmlgz_file = gzopen(filename,"r");
perror(filename); if (xmlgz_file == NULL)
return NULL; {
if (warning_by_nonexistence)
perror(filename);
return NULL;
}
gzbuffer(xmlgz_file, 64*1024);
}
else
{
xml_file = fopen(filename, "r");
if (xml_file == NULL)
{
if (warning_by_nonexistence)
perror(filename);
return NULL;
}
} }
tree_parser = new XMLTreeParser(encoding); tree_parser = new XMLTreeParser(encoding);
do do
{ {
length = fread(buffer, 1, sizeof(buffer), xml_file); if (zipped)
length = gzread(xmlgz_file, buffer, sizeof(buffer));
else
length = fread(buffer, 1, sizeof(buffer), xml_file);
done = length < sizeof(buffer); done = length < sizeof(buffer);
if (!tree_parser->Parse(buffer, length, done)) if (!tree_parser->Parse(buffer, length, done))
@@ -320,17 +416,24 @@ xmlDocPtr parseXmlFile(const char * filename, bool warning_by_nonexistence /* =
tree_parser->ErrorString(tree_parser->GetErrorCode()), tree_parser->ErrorString(tree_parser->GetErrorCode()),
tree_parser->GetCurrentLineNumber()); tree_parser->GetCurrentLineNumber());
fclose(xml_file); if (zipped)
gzclose(xmlgz_file);
else
fclose(xml_file);
delete tree_parser; delete tree_parser;
return NULL; return NULL;
} }
} }
while (!done); while (!done);
if (posix_fadvise(fileno(xml_file), 0, 0, POSIX_FADV_DONTNEED) != 0) if (!zipped)
perror("posix_fadvise FAILED!"); if (posix_fadvise(fileno(xml_file), 0, 0, POSIX_FADV_DONTNEED) != 0)
perror("posix_fadvise FAILED!");
fclose(xml_file); if (zipped)
gzclose(xmlgz_file);
else
fclose(xml_file);
if (!tree_parser->RootNode()) if (!tree_parser->RootNode())
{ {

View File

@@ -127,7 +127,8 @@ neutrino_LDADD = \
-ljpeg \ -ljpeg \
-lutil \ -lutil \
-lOpenThreads \ -lOpenThreads \
-lrt -lpthread -lrt -lpthread \
-lz
if ENABLE_GIFLIB if ENABLE_GIFLIB
neutrino_LDADD += -lgif neutrino_LDADD += -lgif