Welcome to the Second Life Forums Archive

These forums are CLOSED. Please visit the new forums HERE

XML Parser - Work in Progress

Christopher Omega
Oxymoron
Join date: 28 Mar 2003
Posts: 1,828
03-12-2005 09:49
Konnichiwa minnasama! :D
Some have expressed interest in my current project, an XML Parser. It's still not finished, and I have some major bugs to hammer out of it, but here's what I have so far:

EDIT: GAH! The forum won't let me post messages over 20k characters. The script is about 20738 characters. I'll split it in two, and post it in the next two replies.

Note: It still has debug code in it.
Lemme know what you think :)
==Chris
Christopher Omega
Oxymoron
Join date: 28 Mar 2003
Posts: 1,828
Part 1
03-12-2005 10:06
CODE

// XML Parsing Module
// Written by Christopher Omega

// When triggered by a linked message,
// this script parses the file or string specified
// and triggers various xml_* link messages
// depending upon what text it encounters.
// Currently, it can parse only one thing at a time,
// so it queues requests.

// ====== The Interface ======
// ========== For method invocation ==========
// Token placed between the parameters of a method call when they are
// flattened into the string argument of a link message.
string PARAMETER_SEPERATOR = "|_|";

// Announces a method invocation.
//   callId     - identifier for the call (unused while in debug mode).
//   methodName - name of the method being invoked.
//   parameters - arguments of the invocation.
// DEBUG: the call is currently only spoken on channel 0; the link-message
// dispatch is kept below, commented out, exactly as the author left it.
callMethod(integer callId, string methodName, list parameters) {
    string argumentText = llList2CSV(parameters);
    string announcement = "/me - " + llGetScriptName() + ": " + methodName
        + "(" + argumentText + ")";
    llSay(0, announcement);
    //llMessageLinked(llGetLinkNumber(), callId,
    // llDumpList2String(parameters, PARAMETER_SEPERATOR), methodName);
}

// Sends a method's return value back over a link message.
//   methodName       - name of the method that is returning; "_ret" is
//                      appended to form the message's key field.
//   methodIdentifyer - passed through as the integer field of the message.
//   value            - return data, joined with PARAMETER_SEPERATOR.
returnValue(string methodName, integer methodIdentifyer, list value) {
    string payload = llDumpList2String(value, PARAMETER_SEPERATOR);
    string replyName = methodName + "_ret";
    llMessageLinked(llGetLinkNumber(), methodIdentifyer, payload, replyName);
}

// Token placed between the entries of a list that is flattened into a
// single string by encodeList (and split again by decodeList).
string LIST_SEPERATOR = "|~|";

// Flattens src into one string, entries joined by LIST_SEPERATOR.
string encodeList(list src) {
    string flattened = llDumpList2String(src, LIST_SEPERATOR);
    return flattened;
}

// Splits a string produced by encodeList back into a list.
// Empty entries are preserved.
list decodeList(string src) {
    list boundaries = [LIST_SEPERATOR];
    return llParseStringKeepNulls(src, boundaries, []);
}
// =============================================

// Triggered when the parser finishes reading the file.
// Emits the xml_endDocument event for the given request.
trigger_endDocument(integer requestId) {
    list arguments = [requestId];
    callMethod(0, "xml_endDocument", arguments);
}

// Triggered when the parser first starts reading the file.
// Emits the xml_startDocument event for the given request.
trigger_startDocument(integer requestId) {
    list arguments = [requestId];
    callMethod(0, "xml_startDocument", arguments);
}

// Triggered when the parser encounters a XML element (tag)
// Emits the xml_startElement event.
//   requestId   - request the element belongs to.
//   name        - element (tag) name.
//   attribNames - attribute names, sent encoded with encodeList.
//   attribVals  - attribute values, sent encoded with encodeList.
trigger_startElement(integer requestId, string name,
                     list attribNames, list attribVals) {
    string encodedNames = encodeList(attribNames);
    string encodedValues = encodeList(attribVals);
    callMethod(0, "xml_startElement", [requestId, name, encodedNames, encodedValues]);
}

// Triggered when the parser encounters an end XML element (tag)
// Emits the xml_endElement event for the element called name.
trigger_endElement(integer requestId, string name) {
    list arguments = [requestId, name];
    callMethod(0, "xml_endElement", arguments);
}

// Triggered when the parser encounters character data. (data not inside a tag)
// Emits the xml_characters event carrying a run of character data.
trigger_characters(integer requestId, string characters) {
    list arguments = [requestId, characters];
    callMethod(0, "xml_characters", arguments);
}

// Triggered when the parser encounters an error that requires it to
// stop parsing.
// Emits the xml_fatalError event.
//   requestId  - request that failed.
//   lineNumber - line the problem was detected on.
//   errorConst - one of the error constants (e.g. PARSING, FILE_NOT_FOUND).
//   details    - human-readable description.
trigger_fatalError(integer requestId, integer lineNumber,
                   integer errorConst, string details) {
    list arguments = [requestId, lineNumber, errorConst, details];
    callMethod(0, "xml_fatalError", arguments);
}

// Triggered when the parser encounters an error that probably
// invalidates the rest of the data in the document.
// Emits the xml_error event.
//   requestId  - request the error belongs to.
//   lineNumber - line the problem was detected on.
//   errorConst - one of the error constants (e.g. PARSING).
//   details    - human-readable description.
trigger_error(integer requestId, integer lineNumber,
              integer errorConst, string details) {
    list arguments = [requestId, lineNumber, errorConst, details];
    callMethod(0, "xml_error", arguments);
}

// Triggered when the parser encounters a recoverable flaw
// in the document, which most likely doesn't invalidate the data
// in the document.
// Emits the xml_warning event.
//   requestId  - request the warning belongs to.
//   lineNumber - line the problem was detected on.
//   errorConst - one of the error constants.
//   details    - human-readable description.
trigger_warning(integer requestId, integer lineNumber,
                integer errorConst, string details) {
    list arguments = [requestId, lineNumber, errorConst, details];
    callMethod(0, "xml_warning", arguments);
}

// ====== Library/Utility Functions: ======
// Finds the first occurrence of pattern in src at or after index start.
// Returns the index relative to the beginning of src, or -1 if absent.
integer subStringFirstIndex(string src, string pattern, integer start) {
    string remainder = src;
    if (start > 0) {
        // Drop everything before the starting position.
        remainder = llDeleteSubString(src, 0, start - 1);
    }
    integer relative = llSubStringIndex(remainder, pattern);
    if (relative == -1) {
        return -1;
    }
    // Translate back into an index into the full string.
    return relative + start;
}

// Returns the larger value.
// Returns whichever of a and b is larger (b when they are equal).
integer max(integer a, integer b) {
    integer larger = b;
    if (a > b) {
        larger = a;
    }
    return larger;
}

// Kudos to Jeffery Gomez for these two parseString* functions :-D
// Kudos to Jeffery Gomez for these two parseString* functions :-D
//
// llParseStringKeepNulls without the built-in limit of 8 separators and
// 8 spacers: the lists are applied in batches of 8, and the intermediate
// result is re-joined with a marker string between passes.
//   src        - text to split.
//   separators - strings that split the text and are discarded.
//   spacers    - strings that split the text and are kept as entries.
// Returns the list of pieces, empty entries preserved.
list parseStringKeepNulls(string src, list separators, list spacers) {
    integer sep_num = llGetListLength(separators);
    integer spa_num = llGetListLength(spacers);
    // Marker used to glue the pieces back together between passes.
    string separator;
    if (sep_num > 0) {
        separator = llList2String(separators, 0);
    } else {
        // Generate a random string that does not occur in the text:
        do {
            separator = (string) llFrand(1822901);
        } while (llSubStringIndex(src, separator) != -1);
    }
    integer top = max(sep_num, spa_num);
    integer i;
    for (i = 0; i < top; i += 8) {
        // Take an 8-entry window from each list. Once a list is exhausted
        // an empty batch must be used: asking llList2List for a range
        // whose start lies beyond its end wraps around and would
        // re-apply earlier entries, inserting spurious empty pieces.
        list sepBatch = [];
        list spaBatch = [];
        if (i < sep_num)
            sepBatch = llList2List(separators, i, i + 7);
        if (i < spa_num)
            spaBatch = llList2List(spacers, i, i + 7);
        list pieces = llParseStringKeepNulls(src, sepBatch, spaBatch);
        src = llDumpList2String(pieces, separator);
    }
    return llParseStringKeepNulls(src, [separator], []);
}

// llParseString2List without the built-in limit of 8 separators and
// 8 spacers: the lists are applied in batches of 8, and the intermediate
// result is re-joined with a marker string between passes.
//   src        - text to split.
//   separators - strings that split the text and are discarded.
//   spacers    - strings that split the text and are kept as entries.
// Returns the list of pieces; empty entries are dropped.
list parseString2List(string src, list separators, list spacers) {
    integer sep_num = llGetListLength(separators);
    integer spa_num = llGetListLength(spacers);
    // Marker used to glue the pieces back together between passes.
    string separator;
    if (sep_num > 0) {
        separator = llList2String(separators, 0);
    } else {
        // Generate a random string that does not occur in the text:
        do {
            separator = (string) llFrand(1822901);
        } while (llSubStringIndex(src, separator) != -1);
    }
    integer top = max(sep_num, spa_num);
    integer i;
    for (i = 0; i < top; i += 8) {
        // Take an 8-entry window from each list. Once a list is exhausted
        // an empty batch must be used: asking llList2List for a range
        // whose start lies beyond its end wraps around and would
        // re-apply earlier entries.
        list sepBatch = [];
        list spaBatch = [];
        if (i < sep_num)
            sepBatch = llList2List(separators, i, i + 7);
        if (i < spa_num)
            spaBatch = llList2List(spacers, i, i + 7);
        list pieces = llParseString2List(src, sepBatch, spaBatch);
        src = llDumpList2String(pieces, separator);
    }
    return llParseString2List(src, [separator], []);
}

// Returns the string, parsed using spacers and seperators
// leaving elements surrounded by the quote character intact.
// Set by parseQuotesIntact: TRUE when the text it last parsed ended
// while still inside a quoted section.
integer unclosedQuote = FALSE;

// Splits src on the given seperators and spacers while keeping text
// enclosed in quoteChar together as a single entry.
//   seperators - delimiters that are dropped from the result.
//   spacers    - delimiters that are kept in the result.
//   quoteChar  - the quote character.
//   keepNulls  - TRUE to tokenise with parseStringKeepNulls,
//                FALSE to tokenise with parseString2List.
// A quote only opens when it directly follows a seperator or spacer.
list parseQuotesIntact(string src, list seperators, list spacers, string quoteChar, integer keepNulls) {
    list delimeters = seperators + spacers;
    list quoteStarters;
    list quoteEnders = [quoteChar];
    integer count = llGetListLength(delimeters);
    integer n;
    // Build the compound tokens "<delimiter><quote>" (opens a quote)
    // and "<quote><delimiter>" (closes one).
    for (n = 0; n < count; ++n) {
        string delim = llList2String(delimeters, n);
        quoteStarters += [delim + quoteChar];
        quoteEnders += [quoteChar + delim];
    }
    delimeters += quoteChar;
    llSay(0, "delimeters = \"" + llDumpList2String(delimeters, "|") + "\"");

    // Tokenise with every delimiter treated as a spacer, so that nothing
    // is lost and quoted text can be reassembled verbatim.
    list allSpacers = quoteEnders + quoteStarters + delimeters;
    list parsedData;
    if (keepNulls) {
        parsedData = parseStringKeepNulls(src, [], allSpacers);
    } else {
        parsedData = parseString2List(src, [], allSpacers);
    }
    llSay(0, "parsedData = " + llDumpList2String(parsedData, "|"));

    list result;
    string quoted;
    integer insideQuote = FALSE;
    count = llGetListLength(parsedData);
    for (n = 0; n < count; ++n) {
        string token = llList2String(parsedData, n);
        if (insideQuote) {
            if (llListFindList(quoteEnders, [token]) != -1) {
                // Closing quote: emit the collected text as one entry.
                result += quoted;
                quoted = "";
                insideQuote = FALSE;
            } else {
                quoted += token;
            }
        } else {
            if (llListFindList(quoteStarters, [token]) != -1) {
                insideQuote = TRUE;
            } else if (llListFindList(seperators, [token]) == -1) {
                // Keep everything except plain seperators.
                result += token;
            }
        }
    }
    // Still inside a quote at the end of the text: it was never closed.
    unclosedQuote = insideQuote;
    return result;
}

// returns TRUE if the list only consists of
// the value specified
// Returns TRUE if src is non-empty and every entry of src equals the
// first entry of element (same type and same value); FALSE otherwise.
// Only the first entry of element is considered.
integer isListFullOf(list src, list element) {
    element = llList2List(element, 0, 0);
    integer len = llGetListLength(src);
    if (len == 0)
        return FALSE;
    // Compare entry by entry. (Sorting and inspecting index 0 is not
    // reliable for lists of mixed types, since llListSort does not
    // compare entries of different types with each other.)
    integer i;
    for (i = 0; i < len; ++i) {
        if (llListFindList(llList2List(src, i, i), element) != 0)
            return FALSE;
    }
    return TRUE;
}

// Replaces the element(s) at the specified index.
// Filler entry used by replaceListSlice to pad a list that is too short.
string NULL = "";

// Overwrites the entries of dest beginning at index start with the
// entries of src and returns the resulting list.
// If dest is shorter than start, it is first padded with NULL entries.
// An empty src leaves dest unchanged (apart from the padding).
list replaceListSlice(list dest, list src, integer start) {
    if (llGetListEntryType(dest, start - 1) == TYPE_INVALID) {
        integer len;
        for (len = llGetListLength(dest); len < start; len++) {
            dest += NULL;
        }
    }
    integer srcLen = llGetListLength(src);
    // Nothing to write. This guard is required: with srcLen == 0 the
    // range (start, start - 1) below has start > end, which
    // llDeleteSubList treats as an exclusion range, deleting entries
    // that were never meant to be replaced.
    if (srcLen == 0)
        return dest;
    return llListInsertList(llDeleteSubList(dest, start, start + srcLen - 1), src, start);
}

// A list2ListStrided function that obeys its start parameter.
// Returns every stride-th entry of src, counting from index start.
// The leading entries are removed first and the strided copy is then
// taken from index 0 of what remains.
list list2ListStrided(list src, integer start, integer stride) {
    list shifted = src;
    if (start > 0) {
        shifted = llDeleteSubList(src, 0, start - 1);
    }
    return llList2ListStrided(shifted, 0, -1, stride);
}

// Returns TRUE if str starts with prefix, mostly for readability.
// Returns TRUE when the first occurrence of prefix in str is at index 0.
integer strStartsWith(string str, string prefix) {
    integer firstMatch = llSubStringIndex(str, prefix);
    return (firstMatch == 0);
}

// Replaces all instances of toReplace in str with replaceWith.
// Returns str with every occurrence of toReplace swapped for replaceWith.
string replaceAll(string str, string toReplace, string replaceWith) {
    // Split on the search text (keeping empty pieces), then re-join
    // the pieces with the replacement text.
    list pieces = llParseStringKeepNulls(str, [toReplace], []);
    return llDumpList2String(pieces, replaceWith);
}

// ====== Functions specific to this purpose: ======
// Converts all entities in unconverted to literals.
// Returns unconverted with each string in ENTITIES replaced by the
// string at the same position in LITERALS, applied in list order.
string convertEntities(string unconverted) {
    string text = unconverted;
    integer total = llGetListLength(ENTITIES);
    integer idx = 0;
    while (idx < total) {
        string entity = llList2String(ENTITIES, idx);
        string literal = llList2String(LITERALS, idx);
        text = replaceAll(text, entity, literal);
        ++idx;
    }
    return text;
}

// Parses the data contained within the tagData global
// returns TRUE if parsing successful (no fatalErrors triggered)
// Parses the tag text accumulated in the tagData global and emits the
// matching element events:
//   "/name"          -> xml_endElement
//   "name attrs"     -> xml_startElement
//   "name attrs /"   -> xml_startElement followed by xml_endElement
// Malformed attribute lists are reported through trigger_error, after
// which the element is still emitted. tagData is cleared afterwards.
// Returns TRUE if parsing successful (no fatalErrors triggered).
integer dumpTagData() {
    llSay(0, "dumpTagData called.");
    list parsedTag = parseQuotesIntact(tagData, [" "], ["=", "/"], "\"", FALSE);
    llSay(0, "parsedTag = \"" + llDumpList2String(parsedTag, "|") + "\"");
    string tagName = llList2String(parsedTag, 0);
    if (tagName == "/") {
        // This is an end tag.
        tagName = llList2String(parsedTag, 1);
        trigger_endElement(curId, tagName);
    } else {
        integer selfContained = llList2String(parsedTag, -1) == "/"; // if it has a / at the end.
        list attribData = llDeleteSubList(parsedTag, 0, 0);
        if (selfContained)
            attribData = llDeleteSubList(attribData, -1, -1); // Remove the / at the end.
        // "=" is a spacer, so it is kept as its own token and every
        // attribute occupies three entries: name, "=", value.
        // The lists must therefore be sliced with a stride of 3.
        list attribNames = list2ListStrided(attribData, 0, 3);  // 1st of each triple.
        list attribEquals = list2ListStrided(attribData, 1, 3); // 2nd of each triple.
        list attribValues = list2ListStrided(attribData, 2, 3); // 3rd of each triple.
        integer numVals = llGetListLength(attribValues);
        // Check each list for consistency; all should be the same length
        // and attribEquals should be filled with = only.
        if (numVals != llGetListLength(attribNames) || numVals != llGetListLength(attribEquals)
            || !(isListFullOf(attribEquals, ["="]) || numVals == 0)) {
            trigger_error(curId, curLine, PARSING, "Attributes for " + tagName
                + " are defined incorrectly.");
        }
        llSay(0, "numVals = " + (string) numVals);
        llSay(0, "attribNames len = " + (string) llGetListLength(attribNames));
        llSay(0, "attribEquals len = " + (string) llGetListLength(attribEquals));
        llSay(0, "attribNames = \"" + llDumpList2String(attribNames, "|") + "\"");
        // Convert the entities in each value:
        list convertedValues;
        integer i;
        for (i = 0; i < numVals; i++) {
            convertedValues += [convertEntities(llList2String(attribValues, i))];
        }
        trigger_startElement(curId, tagName, attribNames, convertedValues);
        if (selfContained) {
            trigger_endElement(curId, tagName);
        }
    }
    tagData = "";
    return TRUE;
}

// Parses the data contained within the charData global
// returns TRUE if parsing successful (no fatalErrors triggered)
// Emits the text accumulated in the charData global as an
// xml_characters event (entities converted) and clears it.
// Does nothing when charData is empty.
// Returns TRUE if parsing successful (no fatalErrors triggered).
integer dumpCharData() {
    if (charData != "") {
        string decoded = convertEntities(charData);
        trigger_characters(curId, decoded);
        charData = "";
    }
    return TRUE;
}
Christopher Omega
Oxymoron
Join date: 28 Mar 2003
Posts: 1,828
Part 2
03-12-2005 10:07
CODE

// Parses arbitrary XML text data.
// May be stack-intensive, uses recursion liberally,
// returns TRUE if parsing successful (no fatalErrors triggered).
// Parses arbitrary XML text data.
// Consumes lineData from the left, switching between "character data"
// and "inside a tag" mode (withinTag global). Text is accumulated in the
// charData / tagData globals and handed to dumpCharData / dumpTagData
// whenever a tag starts or ends. State is kept in globals, so a tag or
// a run of text may span several calls.
// The text is taken as-is: no line terminator is appended here, so the
// caller is responsible for supplying one where it wants it.
// May be stack-intensive, uses recursion liberally,
// returns TRUE if parsing successful (no fatalErrors triggered).
integer parseLine(string lineData) {
    llSay(0, "parseLine called, lineData == \"" + lineData + "\"");
    llSay(0, "tagData = \"" + tagData + "\"");
    llSay(0, "charData = \"" + charData + "\"");
    // Nothing left to consume (also the end of every recursion chain).
    if (lineData == "")
        return TRUE;
    if (!withinTag) {
        if (strStartsWith(lineData, "<")) {
            // A tag begins: report any text collected before it first,
            // so events are emitted in document order.
            if (!dumpCharData())
                return FALSE; // fatalError occured in dumpCharData.
            withinTag = TRUE;
            return parseLine(llDeleteSubString(lineData, 0, 0));
        } else if (strStartsWith(lineData, ">")) {
            // Yikes, this shouldn't happen.
            trigger_error(curId, curLine, PARSING, "Element ended without beginning.");
            // Try skipping over it
            return parseLine(llDeleteSubString(lineData, 0, 0));
        } else {
            // Handle character data:
            integer tagPosition = llSubStringIndex(lineData, "<");
            if (tagPosition == -1) {
                // No tag in the rest of the text: keep collecting.
                charData += lineData;
            } else {
                // Collect the text up to the tag; the recursive call
                // sees the "<" and flushes it.
                charData += llGetSubString(lineData, 0, tagPosition - 1);
                return parseLine(llDeleteSubString(lineData, 0, tagPosition - 1));
            }
        }
    } else {
        if (strStartsWith(lineData, "<")) {
            // Yikes, this shouldn't happen.
            trigger_error(curId, curLine, PARSING, "Element cannot be defined within another element.");
            // Try skipping over it.
            return parseLine(llDeleteSubString(lineData, 0, 0));
        } else if (strStartsWith(lineData, ">")) {
            llSay(0, "Tag ended!");
            if (!dumpTagData())
                return FALSE; // fatalError occured in dumpTagData.
            withinTag = FALSE;
            return parseLine(llDeleteSubString(lineData, 0, 0));
        } else {
            // Handle additional tag data:
            integer endPosition = llSubStringIndex(lineData, ">");
            if (endPosition == -1) {
                // Tag continues beyond this text: keep collecting.
                tagData += lineData;
            } else {
                tagData += llGetSubString(lineData, 0, endPosition - 1);
                return parseLine(llDeleteSubString(lineData, 0, endPosition - 1));
            }
        }
    }
    return TRUE;
}

// Initilizes the first queued parseXMLFile call,
// prepares it for processing and removes it from queue.
// Initializes the first queued request, prepares it for processing
// and removes it from the queue.
// String requests are parsed completely right here (start of document,
// content, trailing character data, end of document) and the next
// request is then taken from the queue. A file request emits the start
// of document, asks the dataserver for the notecard's first line and
// returns, leaving the rest to the dataserver event.
// Returns TRUE when a file request is now in progress, FALSE when the
// queue has run empty.
integer popRequest() {
    while (llGetListEntryType(requestSourceQueue, 0) != TYPE_INVALID) {
        // Clear document variables:
        tagData = charData = "";
        withinTag = FALSE;

        // Get next request data.
        curSource = llList2String(requestSourceQueue, 0);
        curId = llList2Integer(requestIdQueue, 0);
        curType = llList2Integer(requestTypeQueue, 0);

        // Remove next request from queues.
        requestSourceQueue = llDeleteSubList(requestSourceQueue, 0, 0);
        requestIdQueue = llDeleteSubList(requestIdQueue, 0, 0);
        requestTypeQueue = llDeleteSubList(requestTypeQueue, 0, 0);

        trigger_startDocument(curId);
        if (curType == REQUEST_FILE_READ) {
            // Data is a filename.
            curLine = 0;
            curQuery = llGetNotecardLine(curSource, curLine);
            return TRUE;
        }
        // Data is actual XML text.
        curLine = -1;
        if (parseLine(curSource)) {
            // End of input: report text still waiting in charData,
            // which would otherwise be discarded when the next request
            // resets the document variables.
            dumpCharData();
            trigger_endDocument(curId);
        }
        // TODO: report unclosed elements and run-on quotes.
    }
    return FALSE;
}

// Pushes a new call to parseXMLFile onto the queue.
// Appends a new parse request to the three parallel request queues.
//   data - notecard name or XML text, depending on type.
//   type - REQUEST_FILE_READ or REQUEST_STRING_READ.
//   id   - caller-supplied request identifier.
pushRequest(string data, integer type, integer id) {
    requestIdQueue += [id];
    requestTypeQueue += [type];
    requestSourceQueue += [data];
}

// Global constants:
// XML Entities and Literals (parallel lists: ENTITIES[i] is replaced
// by LITERALS[i], in list order).
// Each entity includes its terminating ";" so that none is left behind
// in the converted text. "&amp;" comes last so that an escaped
// ampersand (e.g. "&amp;lt;") is not decoded a second time.
list ENTITIES = ["&lt;", "&gt;", "&apos;", "&quot;", "&amp;"];
list LITERALS = ["<", ">", "'", "\"", "&"];
// Error constants, passed to fatalError, error and warning:
integer PARSING = 1; // Parsing error occurred.
integer FILE_NOT_FOUND = 2; // The notecard isn't in the object's inventory.
// Request types
integer REQUEST_FILE_READ = 1;
integer REQUEST_STRING_READ = 2;
// Global variables:
// Queues, used to queue calls to parseXMLFile / parseXMLString, as the
// script can only be handling one at a time. The three lists are
// parallel: entry i of each list describes the same request.
list requestTypeQueue;   // REQUEST_FILE_READ or REQUEST_STRING_READ.
list requestSourceQueue; // Notecard name or XML text.
list requestIdQueue;     // Caller-supplied request id.
// Info on request currently being processed in the parsing state.
string curSource;  // Notecard name or XML text of the current request.
integer curId;     // Request id of the current request.
integer curLine;   // Notecard line being read; -1 for string requests.
integer curType;   // Type of the current request.
key curQuery; // Dataserver request key.
// Contains intermediate tag and character data;
// data on the tag/characters currently being parsed.
// Used by parseLine/dumpTagData/dumpCharData functions.
string tagData;
string charData;
// TRUE while the parser is between a "<" and its matching ">".
integer withinTag = FALSE;

// Idle state: waits for a parse request, queues it and switches to the
// parsing state.
default {
    // Handles the two entry points, selected by the key field:
    //   parseXMLFile(integer requestId, string fileName)
    //   parseXMLString(integer requestId, string str)
    // The parameters arrive joined with PARAMETER_SEPERATOR.
    link_message(integer sender, integer num, string parameters, key methodName) {
        integer wantsFile = (methodName == "parseXMLFile");
        integer wantsString = (methodName == "parseXMLString");
        if (wantsFile || wantsString) {
            list paramList = llParseStringKeepNulls(parameters, [PARAMETER_SEPERATOR], []);
            integer requestId = (integer) llList2String(paramList, 0);
            string payload = llList2String(paramList, 1);

            if (wantsString) {
                pushRequest(payload, REQUEST_STRING_READ, requestId);
                state parsing;
            } else if (llGetInventoryKey(payload) == NULL_KEY) {
                // No usable inventory key for that name: reject.
                trigger_fatalError(requestId, -1, FILE_NOT_FOUND, payload
                    + " is not accessible");
            } else {
                pushRequest(payload, REQUEST_FILE_READ, requestId);
                state parsing;
            }
        }
    }
}

// Busy state: works through the request queue. Requests arriving in the
// meantime are queued and picked up once the current one is finished.
state parsing {
    state_entry() {
        // If there aren't any more queued requests:
        if (!popRequest())
            state default;
    }

    // Receives the notecard lines requested through llGetNotecardLine.
    dataserver(key query, string data) {
        if (query == curQuery) {
            curQuery = "";
            if (data != EOF) {
                // If parsing successful:
                if (parseLine(data + "\n")) {
                    // Request the next line:
                    curQuery = llGetNotecardLine(curSource, ++curLine);
                } else {
                    // Something triggered a fatal error,
                    // jump to the next request.
                    if (!popRequest())
                        state default;
                }
            } else {
                // End of the notecard: report any text still pending,
                // signal the end of the document and move on, so that
                // the script does not stay in this state forever with
                // the remaining queue unprocessed.
                // TODO: report unclosed elements and run-on quotes.
                dumpCharData();
                trigger_endDocument(curId);
                if (!popRequest())
                    state default;
            }
        }
    }

    // Queues requests that arrive while another one is being parsed.
    link_message(integer sender, integer num, string parameters, key methodName) {
        if (methodName == "parseXMLFile") {
            list paramList = llParseStringKeepNulls(parameters, [PARAMETER_SEPERATOR], []);
            // Method signature:
            // parseXMLFile(integer requestId, string fileName)
            integer requestId = (integer) llList2String(paramList, 0);
            string fileName = llList2String(paramList, 1);

            if (llGetInventoryKey(fileName) != NULL_KEY) {
                pushRequest(fileName, REQUEST_FILE_READ, requestId);
            } else {
                trigger_fatalError(requestId, -1, FILE_NOT_FOUND, fileName
                    + " is not accessible");
            }
        } else if (methodName == "parseXMLString") {
            list paramList = llParseStringKeepNulls(parameters, [PARAMETER_SEPERATOR], []);
            // Method signature:
            // parseXMLString(integer requestId, string str)
            integer requestId = (integer) llList2String(paramList, 0);
            string str = llList2String(paramList, 1);
            pushRequest(str, REQUEST_STRING_READ, requestId);
        }
    }
}