Skip to content

Instantly share code, notes, and snippets.

@nathan130200
Last active November 30, 2025 22:31
Show Gist options
  • Select an option

  • Save nathan130200/1be1004e2bb04529ea44d7ffd9ffafb7 to your computer and use it in GitHub Desktop.

Select an option

Save nathan130200/1be1004e2bb04529ea44d7ffd9ffafb7 to your computer and use it in GitHub Desktop.
Basic XML parser implementation on top of expat.

XML Parser

Basic XML parsing using libexpat bindings for .NET.

  • It's optimized for memory usage: if you don't subscribe to a SAX-like event (start/end tag, comment, etc.), the corresponding handler does not produce any strings.
  • You can fully reset the parser's internal state.
  • You don't need to check end tags yourself, because expat already does.
  • Uses expat's internal string pooling (maybe adding a NameTable would be useful too?).
  • No unsafe operations (fixed pointers, etc.). All marshaling is done in a "safe" context.
  • Library name hints for multiple platforms. E.g. vcpkg builds produce libexpat.dll while other compilers produce expat.dll instead; the same applies to Unix (.so) and macOS (.dylib) systems.
  • Useful for streaming XML (e.g. the XMPP protocol). DO NOT try to parse HTML with it: HTML syntax is based on SGML (it allows unclosed tags and some extra characters in attribute names), while XML is a stricter subset of SGML. Parsing HTML requires an SGML/HTML parser instead (unless the website serves well-formed, XML-like markup, i.e. all opening and closing tags are correctly matched).

Usage

using System.Buffers;
using System.Diagnostics;
using System.Reflection;
using System.Runtime.InteropServices;
using System.Text;
using System.Xml;

/// <summary>
/// Usage sample: feeds <c>sample.xml</c> to <see cref="XmlParser"/> in small chunks,
/// rebuilds the document as an <see cref="XmlDocument"/> from the parser events and
/// prints the resulting markup to the console.
/// </summary>
static class Program
{
	/// <summary>Entry point of the sample.</summary>
	/// <param name="args">Command-line arguments (not used).</param>
	static void Main(string[] args)
	{
		XmlDocument doc = new();

		// "root" is the document element; "current" is the element whose content is being parsed.
		XmlElement root = null, current = null;

		using (var parser = new XmlParser())
		{
			parser.OnStartTag += (name, attrs) =>
			{
				Console.WriteLine("StartTag");

				var element = doc.CreateElement(name);

				foreach (var (key, value) in attrs)
					element.SetAttribute(key, value);

				if (root == null)
				{
					root = element;
					doc.AppendChild(root);
				}

				// For the root element "current" is still null, so nothing is appended twice.
				current?.AppendChild(element);
				current = element;
			};

			parser.OnCdata += value =>
			{
				Console.WriteLine("Cdata");
				current?.AppendChild(doc.CreateCDataSection(value));
			};

			parser.OnText += value =>
			{
				Console.WriteLine("Text");
				current?.AppendChild(doc.CreateTextNode(value));
			};

			parser.OnEndTag += name =>
			{
				Console.WriteLine("EndTag");

				// Step back to the parent; at the root the parent is the document, so "current" stays put.
				if (current?.ParentNode is XmlElement e)
					current = e;
			};

			parser.OnComment += value =>
			{
				Console.WriteLine("Comment");
				current?.AppendChild(doc.CreateComment(value));
			};

			using (var fs = File.OpenRead("sample.xml"))
			{
				// Small buffer, so the file is handed to the parser in many chunks.
				const int bufferSize = 8;

				// Ceiling division: a trailing partial chunk still counts as one block.
				var nTotalBlocks = (fs.Length + bufferSize - 1) / bufferSize;
				var nCurrentBlock = 0;

				var buf = new byte[bufferSize];

				while (true)
				{
					int len = fs.Read(buf);

					// A zero-length chunk is passed on as well: the parser treats it as end of input.
					parser.Parse(buf, len);

					if (len == 0)
						break;

					// Count the block before reporting so the title shows blocks already written.
					nCurrentBlock++;
					ReportProgress();
				}

				ReportProgress();

				void ReportProgress()
				{
					Console.Title = $"Write block {(nCurrentBlock)} / {nTotalBlocks} ({((float)fs.Position / fs.Length) * 100f:F2}%)";
				}
			}

		}
		var sb = new StringBuilder();

		var settings = new XmlWriterSettings
		{
			ConformanceLevel = ConformanceLevel.Fragment,
			IndentChars = "  ",
			Indent = false
		};

		using (var writer = XmlWriter.Create(sb, settings))
			root?.WriteTo(writer);

		Console.WriteLine(sb);

		Console.ReadKey(true);
	}
}
public sealed class XmlParser : IDisposable
{
// Native expat parser handle returned by XML_ParserCreate; set back to 0 once freed in Dispose.
nint _parser;
// Encoding selected in the constructor (UTF-8 when none is supplied); cleared in Dispose.
Encoding _encoding;
// GC handle to this instance; its pointer value is registered as expat user data so the
// static callbacks can map the native call back to the managed parser (see GetParser).
GCHandle _userData;
// Set by Dispose; checked by the public methods and by most callbacks.
volatile bool _disposed;
// True between the CDATA start and CDATA end callbacks.
volatile bool _isCdata;
// Collects character data received inside a CDATA section until the section ends.
StringBuilder _cdata;
// Lock taken by Parse, Suspend and Reset around their native calls.
readonly Lock _syncRoot = new();
/// <summary>
/// Creates the native expat parser and registers this instance's callbacks on it.
/// </summary>
/// <param name="encoding">
/// Encoding whose <see cref="Encoding.WebName"/> (upper-cased) is passed to
/// <c>XML_ParserCreate</c>; UTF-8 is used when <c>null</c>.
/// </param>
/// <exception cref="InvalidOperationException">The native parser could not be created.</exception>
public XmlParser(Encoding encoding = default)
{
    _encoding = encoding ?? Encoding.UTF8;

    // Invariant casing: a culture-sensitive ToUpper() would turn "iso-8859-1" into
    // "İSO-8859-1" under Turkish culture, producing a name that is no longer plain ASCII.
    _parser = XML_ParserCreate(_encoding.WebName.ToUpperInvariant());

    if (_parser == 0)
        throw new InvalidOperationException("Cannot create expat parser.");

    // Normal (non-pinned) handle: only its opaque pointer value travels through native code.
    _userData = GCHandle.Alloc(this, GCHandleType.Normal);
    Setup(false);
}
/// <summary>
/// Frees the native parser and the GC handle used as expat user data.
/// Calling it more than once is a no-op.
/// </summary>
public void Dispose()
{
    // Use the same lock as Parse/Suspend/Reset (not "lock (this)") so the native parser
    // cannot be freed while another thread is still inside a native call on it.
    lock (_syncRoot)
    {
        if (_disposed)
            return;

        _disposed = true;

        if (_parser != 0)
        {
            XML_ParserFree(_parser);
            _parser = 0;
        }

        if (_userData.IsAllocated)
            _userData.Free();

        _encoding = null;
    }
}
/// <summary>
/// Throws <see cref="ObjectDisposedException"/> when this parser has already been disposed.
/// </summary>
void ThrowIfDisposed()
{
    ObjectDisposedException.ThrowIf(_disposed, this);
}
/// <summary>
/// Throws an <see cref="XmlException"/> describing the parser's current error. The error
/// code and the current line, column, byte index and byte count are attached to
/// <see cref="Exception.Data"/> under the keys "Code", "LineNumber", "LinePosition",
/// "ByteIndex" and "ByteCount".
/// </summary>
/// <exception cref="XmlException">Always thrown.</exception>
void ThrowFailed()
{
    // Snapshot the flag once so all values below are derived from the same state.
    // When disposed, the native handle is gone and must not be queried.
    var disposed = _disposed;

    var code = disposed
        ? 23 // XML_ERROR_UNEXPECTED_STATE
        : XML_GetErrorCode(_parser);

    var lineNum = disposed ? 0 : XML_GetCurrentLineNumber(_parser);
    var linePos = disposed ? 0 : XML_GetCurrentColumnNumber(_parser);
    var byteIndex = disposed ? 0 : XML_GetCurrentByteIndex(_parser);
    var byteCount = disposed ? 0 : XML_GetCurrentByteCount(_parser);

    // The table only contains codes 0..64 for which the native library returned a text;
    // fall back to a generic message instead of failing with KeyNotFoundException.
    if (!s_ErrorMessages.TryGetValue(code, out var message))
        message = $"Unknown expat error (code {code}).";

    throw new XmlException(message)
    {
        Data =
        {
            ["Code"] = code,
            ["LineNumber"] = lineNum,
            ["LinePosition"] = linePos,
            ["ByteIndex"] = byteIndex,
            ["ByteCount"] = byteCount
        }
    };
}
/// <summary>
/// Asks the native parser to stop via <c>XML_StopParser</c>.
/// </summary>
/// <param name="resumable">Forwarded to <c>XML_StopParser</c> as its resumable flag.</param>
/// <exception cref="ObjectDisposedException">The parser has been disposed.</exception>
/// <exception cref="XmlException">The native call reported a failure (returned 0).</exception>
public void Suspend(bool resumable = true)
{
    ThrowIfDisposed();

    lock (_syncRoot)
    {
        var status = XML_StopParser(_parser, resumable);

        if (status != 0)
            return;

        ThrowFailed();
    }
}
/// <summary>
/// Resumes a suspended parser via <c>XML_ResumeParser</c>.
/// </summary>
/// <exception cref="ObjectDisposedException">The parser has been disposed.</exception>
/// <exception cref="XmlException">The native call did not return 1.</exception>
public void Resume()
{
    ThrowIfDisposed();

    // Take the same lock as Parse/Suspend/Reset: resuming re-enters the native parser,
    // so it must not overlap with other native calls on the same handle.
    lock (_syncRoot)
    {
        if (XML_ResumeParser(_parser) != 1)
            ThrowFailed();
    }
}
/// <summary>
/// Feeds the first <paramref name="len"/> bytes of <paramref name="buf"/> to the native parser.
/// A call with <paramref name="len"/> &lt;= 0 is passed on as the final chunk.
/// </summary>
/// <param name="buf">Buffer holding the next chunk of the document.</param>
/// <param name="len">Number of valid bytes at the start of <paramref name="buf"/>.</param>
/// <exception cref="ObjectDisposedException">The parser has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="buf"/> is null while <paramref name="len"/> is positive.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="len"/> exceeds the buffer length.</exception>
/// <exception cref="XmlException">The native parser did not return 1.</exception>
public void Parse(byte[] buf, int len)
{
    ThrowIfDisposed();

    // Only validate when data is actually supplied, so a final call such as Parse(null, 0)
    // keeps working. Without this check the native side would read past the pinned array.
    if (len > 0)
    {
        ArgumentNullException.ThrowIfNull(buf);
        ArgumentOutOfRangeException.ThrowIfGreaterThan(len, buf.Length);
    }

    lock (_syncRoot)
    {
        // Re-check after acquiring the lock in case the parser was disposed while waiting for it.
        ThrowIfDisposed();

        // Pin the buffer so its address stays valid for the duration of the native call.
        var handle = GCHandle.Alloc(buf, GCHandleType.Pinned);

        try
        {
            if (XML_Parse(_parser, handle.AddrOfPinnedObject(), len, len <= 0) != 1)
                ThrowFailed();
        }
        finally
        {
            handle.Free();
        }
    }
}
/// <summary>
/// Optionally resets the native parser, then (re)registers the user data and all callbacks.
/// </summary>
/// <param name="reset">When true, <c>XML_ParserReset</c> is called first.</param>
/// <exception cref="XmlException"><c>XML_ParserReset</c> returned 0.</exception>
void Setup(bool reset)
{
    if (reset)
    {
        if (XML_ParserReset(_parser) == 0)
            ThrowFailed();
    }

    // Drop managed CDATA state too; otherwise a reset in the middle of a CDATA section
    // would leave _isCdata set and route the next document's text into the stale buffer.
    _isCdata = false;
    _cdata = null;

    // Registered on every call, including after a reset (presumably because the native
    // reset drops user data and handlers - see the expat reference).
    XML_SetUserData(_parser, (nint)_userData);
    XML_SetElementHandler(_parser, s_OnStartElementHandler, s_OnEndElementHandler);
    XML_SetCharacterDataHandler(_parser, s_OnCharacterDataHandler);
    XML_SetCdataSectionHandler(_parser, s_OnCdataStartHandler, s_OnCdataEndHandler);
    XML_SetCommentHandler(_parser, s_OnCommentHandler);
}
/// <summary>
/// Resets the native parser and registers the callbacks again.
/// </summary>
/// <exception cref="ObjectDisposedException">The parser has been disposed.</exception>
public void Reset()
{
    ThrowIfDisposed();

    lock (_syncRoot)
    {
        Setup(reset: true);
    }
}
/// <summary>Raised for a start tag with the tag name and its attributes (name to value).</summary>
public event Action<string, IReadOnlyDictionary<string, string>> OnStartTag;
/// <summary>Raised for an end tag with the tag name.</summary>
public event Action<string> OnEndTag;
/// <summary>Raised for each piece of character data reported outside a CDATA section.</summary>
public event Action<string> OnText;
/// <summary>Raised when a CDATA section ends, with the content collected for that section.</summary>
public event Action<string> OnCdata;
/// <summary>Raised for a comment with the comment text.</summary>
public event Action<string> OnComment;
// Handle of the native expat library once ResolveNativeLibrary has loaded it (0 until then).
static nint s_hModule;
// Error code -> message table, filled by the static constructor from XML_ErrorString.
static readonly Dictionary<int, string> s_ErrorMessages = [];
/// <summary>
/// Installs the native-library resolver for this assembly and builds the error message
/// table by asking the native library for the text of codes 1 through 64.
/// </summary>
static XmlParser()
{
    NativeLibrary.SetDllImportResolver(typeof(XmlParser).Assembly, ResolveNativeLibrary);

    s_ErrorMessages[0] = "(no error)";

    const int lastCode = 64;
    var code = 1;

    while (code <= lastCode)
    {
        nint text = XML_ErrorString(code);

        // Codes without a message pointer are simply left out of the table.
        if (text != 0)
            s_ErrorMessages[code] = Marshal.PtrToStringAnsi(text);

        code++;
    }
}
// Candidate file names for the native expat library, tried in this order by ResolveNativeLibrary.
static readonly IEnumerable<string> s_CommonLibraryNames = [
// Some compilers produce the library without the "lib" prefix.
// The _d suffix is for DEBUG builds only.
#if DEBUG
"expat_d",
"libexpat_d",
#endif
"expat",
"libexpat",
// The .so, .dll and .dylib extensions are handled by the system.
];
// Every DllImportSearchPath flag OR-ed together; used when the runtime supplies no search path.
static readonly DllImportSearchPath s_SearchPaths
= Enum.GetValues<DllImportSearchPath>()
.Aggregate(default(DllImportSearchPath), (x, y) => x | y);
// Guards the one-time load of the native library in ResolveNativeLibrary.
static readonly Lock g_SyncRoot = new();
/// <summary>
/// DllImport resolver: maps the placeholder library name used by the P/Invoke declarations
/// to a real expat library. The common file names are tried first, then the path in the
/// <c>EXPAT_LIBRARY</c> environment variable. The loaded handle is cached.
/// </summary>
/// <param name="libraryName">Library name requested by the runtime.</param>
/// <param name="assembly">Assembly that requested the load.</param>
/// <param name="searchPath">Search path from the import, or null to use all search paths.</param>
/// <returns>The library handle, or 0 when the name is not ours or nothing could be loaded.</returns>
static nint ResolveNativeLibrary(string libraryName, Assembly assembly, DllImportSearchPath? searchPath)
{
    // Only the placeholder name used by this class's P/Invoke declarations is handled here.
    if (libraryName != LibraryName)
        return 0;

    // Serialize the load so the library is only resolved once.
    lock (g_SyncRoot)
    {
        // Already loaded: reuse the cached handle.
        if (s_hModule != 0)
            return s_hModule;

        var effectivePaths = searchPath ?? s_SearchPaths;

        // Try every well-known expat file name; the first one that loads wins.
        foreach (var candidate in s_CommonLibraryNames)
        {
            if (NativeLibrary.TryLoad(candidate, assembly, effectivePaths, out var loaded))
            {
                s_hModule = loaded;
                return s_hModule;
            }
        }

        // None of the names worked: fall back to an explicit path from the environment.
        var overridePath = Environment.GetEnvironmentVariable("EXPAT_LIBRARY");

        if (File.Exists(overridePath)
            && NativeLibrary.TryLoad(overridePath, assembly, effectivePaths, out var fromEnvironment))
        {
            s_hModule = fromEnvironment;
        }

        return s_hModule;
    }
}
#pragma warning disable
// Placeholder library name; ResolveNativeLibrary maps it to the actual expat library file.
const string LibraryName = "@expat";
// NOTE(review): none of the imports below specify a CallingConvention, while the callback
// delegates further down declare Cdecl - confirm this is intended for 32-bit Windows targets.
// NOTE(review): the integer return types of the XML_GetCurrent* functions should be checked
// against the XML_Size / XML_Index typedefs of the expat build in use - TODO confirm.

// --- Parser lifetime and parsing ---
[DllImport(LibraryName)]
static extern nint XML_ParserCreate(string encoding);
[DllImport(LibraryName)]
static extern int XML_Parse(nint parser, nint buffer, int length, [MarshalAs(UnmanagedType.Bool)] bool final);
[DllImport(LibraryName)]
static extern void XML_ParserFree(nint parser);
[DllImport(LibraryName)]
static extern int XML_ParserReset(nint parser);
[DllImport(LibraryName)]
static extern int XML_StopParser(nint parser, [MarshalAs(UnmanagedType.I1)] bool resumable);
[DllImport(LibraryName)]
static extern int XML_ResumeParser(nint parser);
// --- Position and error information (used by ThrowFailed) ---
[DllImport(LibraryName)]
static extern long XML_GetCurrentLineNumber(nint parser);
[DllImport(LibraryName)]
static extern long XML_GetCurrentColumnNumber(nint parser);
[DllImport(LibraryName)]
static extern int XML_GetCurrentByteIndex(nint parser);
[DllImport(LibraryName)]
static extern int XML_GetCurrentByteCount(nint parser);
[DllImport(LibraryName)]
static extern int XML_GetErrorCode(nint parser);
// Used by the start-element callback as the number of name/value pointers to read.
[DllImport(LibraryName)]
static extern int XML_GetSpecifiedAttributeCount(nint parser);
[DllImport(LibraryName)]
static extern void XML_SetUserData(nint parser, nint userData);
// Returns a pointer to the message text for an error code (0 when there is none).
[DllImport(LibraryName)]
static extern nint XML_ErrorString(int code);
// --- Callback registration ---
[DllImport(LibraryName)]
static extern void XML_SetElementHandler(nint parser, XML_StartElementHandler start, XML_EndElementHandler end);
[DllImport(LibraryName)]
static extern void XML_SetCdataSectionHandler(nint parser, XML_CdataSectionHandler start, XML_CdataSectionHandler end);
[DllImport(LibraryName)]
static extern void XML_SetCharacterDataHandler(nint parser, XML_CharacterDataHandler handler);
[DllImport(LibraryName)]
static extern void XML_SetCommentHandler(nint parser, XML_CommentHandler handler);
// -------------------------------------------------- //
// Managed signatures of the native callbacks; userData is the GC handle value set in Setup.
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
delegate void XML_CdataSectionHandler(nint userData);
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
delegate void XML_StartElementHandler(nint userData, nint tagNamePtr, nint attrListPtr);
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
delegate void XML_EndElementHandler(nint userData, nint tagNamePtr);
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
delegate void XML_CommentHandler(nint userData, nint buf);
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
delegate void XML_CharacterDataHandler(nint userData, nint buffer, int length);
/// <summary>
/// Recovers the managed parser from the user-data pointer handed to a native callback.
/// </summary>
/// <param name="userData">GC handle value previously registered with <c>XML_SetUserData</c>.</param>
/// <returns>The handle's target as an <see cref="XmlParser"/>.</returns>
static XmlParser GetParser(nint userData)
{
    var handle = GCHandle.FromIntPtr(userData);
    var parser = handle.Target as XmlParser;

    Debug.Assert(parser != null);

    return parser;
}
/// <summary>
/// Native start-tag callback: builds the attribute dictionary and raises <see cref="OnStartTag"/>.
/// Nothing is marshaled when the parser is disposed or the event has no subscribers.
/// </summary>
static readonly XML_StartElementHandler s_OnStartElementHandler = (userData, tagNamePtr, attrListPtr) =>
{
    var state = GetParser(userData);

    if (state._disposed)
        return;

    // Read the event once so the subscriber list cannot change between check and invoke.
    var handler = state.OnStartTag;

    if (handler == null)
        return;

    // The count covers names and values together, i.e. two pointers per attribute.
    var numAttributes = XML_GetSpecifiedAttributeCount(state._parser);
    var attributes = new Dictionary<string, string>(numAttributes / 2);

    for (int i = 0; i < numAttributes; i += 2)
    {
        var attrNamePtr = Marshal.ReadIntPtr(attrListPtr, i * nint.Size);
        var attrValuePtr = Marshal.ReadIntPtr(attrListPtr, (i + 1) * nint.Size);

        // expat hands out UTF-8 strings; PtrToStringAnsi would decode them with the
        // system ANSI code page on Windows and corrupt non-ASCII names and values.
        attributes[Marshal.PtrToStringUTF8(attrNamePtr)] = Marshal.PtrToStringUTF8(attrValuePtr);
    }

    handler(Marshal.PtrToStringUTF8(tagNamePtr), attributes);
};
/// <summary>
/// Native end-tag callback: raises <see cref="OnEndTag"/> with the tag name.
/// The name is only marshaled when the event has subscribers.
/// </summary>
static readonly XML_EndElementHandler s_OnEndElementHandler = (userData, tagNamePtr) =>
{
    var state = GetParser(userData);

    if (state._disposed)
        return;

    // expat hands out UTF-8 strings; PtrToStringAnsi would use the system ANSI code page
    // on Windows and corrupt non-ASCII tag names.
    state.OnEndTag?.Invoke(Marshal.PtrToStringUTF8(tagNamePtr));
};
/// <summary>
/// Decodes <paramref name="len"/> bytes of unmanaged memory into a string.
/// The bytes are copied into a pooled scratch array first, so no unsafe code is needed.
/// </summary>
/// <param name="enc">Encoding used to decode the bytes.</param>
/// <param name="buf">Address of the first byte.</param>
/// <param name="len">Number of bytes to decode.</param>
/// <returns>The decoded text.</returns>
static string DecodeStringFromPointer(Encoding enc, nint buf, int len)
{
    // The rented array may be larger than requested; only the first len bytes are used.
    byte[] scratch = ArrayPool<byte>.Shared.Rent(len);
    string decoded;

    try
    {
        Marshal.Copy(buf, scratch, 0, len);
        decoded = enc.GetString(scratch, 0, len);
    }
    finally
    {
        ArrayPool<byte>.Shared.Return(scratch);
    }

    return decoded;
}
/// <summary>
/// Native comment callback: raises <see cref="OnComment"/> with the comment text.
/// The text is only marshaled when the event has subscribers.
/// </summary>
static readonly XML_CommentHandler s_OnCommentHandler = (userData, buf) =>
{
    var state = GetParser(userData);

    if (state._disposed)
        return;

    // expat hands out UTF-8 strings; PtrToStringAnsi would use the system ANSI code page
    // on Windows and corrupt non-ASCII comment text.
    state.OnComment?.Invoke(Marshal.PtrToStringUTF8(buf));
};
/// <summary>
/// Native character-data callback. Inside a CDATA section the text is appended to the
/// CDATA buffer (only when <see cref="OnCdata"/> has subscribers); otherwise
/// <see cref="OnText"/> is raised with the decoded text.
/// </summary>
static readonly XML_CharacterDataHandler s_OnCharacterDataHandler = (userData, buf, len) =>
{
    var state = GetParser(userData);

    if (state._disposed)
        return;

    // expat reports character data as UTF-8 regardless of the document's own encoding,
    // so decoding with the input encoding would garble non-ASCII text (e.g. ISO-8859-1 input).
    var outputEncoding = Encoding.UTF8;

    if (state._isCdata)
    {
        if (state.OnCdata == null)
            return;

        // The buffer is created by the CDATA start callback; it is missing when the first
        // subscriber was added in the middle of a section, in which case the chunk is skipped.
        state._cdata?.Append(DecodeStringFromPointer(outputEncoding, buf, len));
    }
    else
    {
        state.OnText?.Invoke(DecodeStringFromPointer(outputEncoding, buf, len));
    }
};
/// <summary>
/// Native CDATA-start callback: marks the parser as being inside a CDATA section and,
/// when <see cref="OnCdata"/> has subscribers, creates a fresh buffer for its content.
/// </summary>
static readonly XML_CdataSectionHandler s_OnCdataStartHandler = userData =>
{
    var parser = GetParser(userData);

    parser._isCdata = true;

    // Without subscribers no buffer is created, so no text is collected.
    if (parser.OnCdata == null)
        return;

    parser._cdata = new StringBuilder();
};
/// <summary>
/// Native CDATA-end callback: leaves CDATA mode and raises <see cref="OnCdata"/> with the
/// text collected for the section.
/// </summary>
static readonly XML_CdataSectionHandler s_OnCdataEndHandler = (userData) =>
{
    var state = GetParser(userData);
    state._isCdata = false;

    // Detach the buffer before invoking subscribers so it is released on every path
    // (no subscribers left, or a subscriber that throws).
    var collected = state._cdata;
    state._cdata = null;

    var handler = state.OnCdata;

    // The buffer is missing when the first subscriber was added in the middle of the
    // section; there is no captured content to report in that case.
    if (handler != null && collected != null)
        handler(collected.ToString());
};
}
#endregion
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment