我试图解析通过TCP发送到我的C#应用程序的XML消息.遗憾的是,协议无法更改,XML消息也没有分隔,也没有使用长度前缀.此外,字符编码不是固定的,但每条消息都以XML声明开头<?xml>.问题是,如何使用C#一次读取一条XML消息.
到目前为止,我尝试将TCP流中的数据读入一个字节数组,并通过MemoryStream来使用它.问题是,缓冲区可能包含多个XML消息,或者第一条消息可能不完整.在这些情况下,当我尝试用XmlReader.Read或XmlDocument.Load解析它时会出现异常,但不幸的是,XmlException并不能让我区分这两种问题(除非解析本地化的错误字符串).
我尝试使用XmlReader.Read并统计Element和EndElement节点的数量.这样我就知道什么时候读完了第一条完整的XML消息.
但是,有几个问题.如果缓冲区尚未包含整个消息,我如何将由此产生的XmlException与实际无效、格式不正确的消息区分开?换句话说,如果在读取到第一个根EndElement之前抛出异常,我该如何决定是报错并中止连接,还是从TCP流中收集更多字节?
如果没有发生异常,XmlReader会定位在根EndElement的开头.将XmlReader强制转换为IXmlLineInfo可以得到当前的LineNumber和LinePosition,但要得到EndElement真正结束处的字节位置并不简单.为了做到这一点,我必须将字节数组转换为字符串(使用XML声明中指定的编码),定位到LineNumber和LinePosition,再转换回字节偏移.我尝试使用StreamReader.ReadLine,但StreamReader不提供对当前字节位置的公共访问.
所有这些看起来都非常不优雅且不健壮.我想知道你是否有更好的解决方案.谢谢.
经过一段时间的摸索,我想我可以这样回答自己的问题(我可能是错的,欢迎指正):
我没有找到让XmlReader继续解析第二条 XML 消息的方法(至少在第二条消息带有XmlDeclaration时不行)。XmlTextReader.ResetState可以做类似的事情,但为此我必须假设所有消息都使用相同的编码。因此我无法将XmlReader直接连接到 TCP 流。
关闭XmlReader后,缓冲区不会停在读取器最后读到的位置。因此不可能关闭读取器,再用新的读取器继续读取下一条消息。我想原因是读取器无法在每一种可能的输入流上成功定位(seek)。
当XmlReader抛出异常时,无法确定它是由于过早的 EOF 还是由于 XML 格式不正确而发生的。发生异常时XmlReader.EOF不会被设置。作为解决方法,我派生了自己的 MemoryStream,它将最后一个字节作为单个字节单独返回。这样我就知道XmlReader确实读取了最后一个字节,随后的异常很可能是由于消息被截断所致(这有点草率,因为它可能无法检测出每一条格式不正确的消息。但是,在向缓冲区追加更多数据之后,迟早会检测到错误)。
我可以将XmlReader强制转换为IXmlLineInfo接口,该接口可以访问当前节点的LineNumber和LinePosition。因此,在读完第一条消息后,我记住这些位置并用它们来截断缓冲区。这是真正草率的部分,因为我必须借助字符编码来获得字节位置。我相信您可以为下面的代码找到会出问题的测试用例(例如带有混合编码的内部元素)。但到目前为止,它通过了我的所有测试。
这是我想出的解析器类——它可能有用(我知道,它远非完美......)
// Incrementally extracts XML messages from an undelimited byte stream.
// Received bytes are accumulated with Append(); each call to deserialize()
// tries to parse the first complete XML document in the buffer and, on
// success, removes the bytes of that document from the buffer.
class XmlParser {
    // Bytes received so far that have not been consumed by deserialize().
    private byte[] buffer = new byte[0];

    // Characters that start a line break (\r, \n and \r\n are all recognised).
    private static readonly char[] NewlineChars = new char[] {'\r', '\n'};

    // Number of buffered bytes that have not been consumed yet.
    public int Length {
        get {
            return buffer.Length;
        }
    }

    // Appends new binary data to the internal buffer and returns this
    // instance so that calls can be chained. A null or empty array is ignored.
    public XmlParser Append(byte[] buffer2) {
        if (buffer2 != null && buffer2.Length > 0) {
            // Not efficient (reallocates on every call); the EofMemoryStream
            // could handle a List<byte[]> instead.
            byte[] new_buffer = new byte[buffer.Length + buffer2.Length];
            Buffer.BlockCopy(buffer, 0, new_buffer, 0, buffer.Length);
            Buffer.BlockCopy(buffer2, 0, new_buffer, buffer.Length, buffer2.Length);
            buffer = new_buffer;
        }
        return this;
    }

    // Read-only, forward-only stream over a byte array which hands out the
    // last byte of the array in a Read call of its own. Together with the EOF
    // flag (set once a read request could not deliver any byte) this tells us
    // whether the buffering XmlReader really looked at the end of the data.
    private class EofMemoryStream: Stream {
        // True once a non-empty read request returned zero bytes.
        public bool EOF { get; private set; }

        private readonly MemoryStream mem_;

        public EofMemoryStream(byte[] buffer) {
            mem_ = new MemoryStream(buffer, false);
            EOF = false;
        }

        public override bool CanSeek {
            get {
                return false;
            }
        }

        public override bool CanWrite {
            get {
                return false;
            }
        }

        public override bool CanRead {
            get {
                return true;
            }
        }

        public override long Length {
            get {
                return mem_.Length;
            }
        }

        public override long Position {
            get {
                return mem_.Position;
            }
            set {
                throw new NotSupportedException();
            }
        }

        public override void Flush() {
            mem_.Flush();
        }

        public override long Seek(long offset, SeekOrigin origin) {
            throw new NotSupportedException();
        }

        public override void SetLength(long value) {
            throw new NotSupportedException();
        }

        public override void Write(byte[] buffer, int offset, int count) {
            throw new NotSupportedException();
        }

        // Reads at most count bytes, but never the final byte together with
        // preceding bytes: while more than one byte remains, the final byte is
        // held back so that it is delivered by a separate call.
        public override int Read(byte[] buffer, int offset, int count) {
            int allowed = Math.Min(count, Math.Max(1, (int)(Length - Position - 1)));
            int nread = mem_.Read(buffer, offset, allowed);
            // A zero-length request says nothing about the end of the data,
            // so only a real request that yields no bytes marks EOF.
            if (nread == 0 && count > 0) {
                EOF = true;
            }
            return nread;
        }

        protected override void Dispose(bool disposing) {
            if (disposing) {
                mem_.Dispose();
            }
            base.Dispose(disposing);
        }
    }

    // Parses the first XML message from the buffer.
    // Returns null if the buffer is empty or the first message is not yet
    // complete. If the buffer contains non-wellformed XML it ~should~ throw an
    // XmlException (detection relies on whether the reader reached the end of
    // the buffer, so a broken message may only be detected after more data
    // has been appended). After a message has been read, its bytes are
    // removed from the buffer. Throws InvalidOperationException if the end of
    // the message cannot be located while further data follows it.
    public Message deserialize() {
        if (buffer.Length == 0) {
            return null;
        }
        Message message = null;
        Encoding encoding = Message.default_encoding;
        using (EofMemoryStream sbuffer = new EofMemoryStream(buffer)) {
            XmlDocument xmlDocument = null;
            XmlReaderSettings settings = new XmlReaderSettings();
            int LineNumber = -1;
            int LinePosition = -1;
            bool root_is_empty_element = false;
            bool truncate_buffer = false;
            using (XmlReader xmlReader = XmlReader.Create(sbuffer, settings)) {
                try {
                    // Read to the first node, skipping whitespace and comments.
                    // MoveToContent is not used here because it would skip the
                    // XmlDeclaration too.
                    while (xmlReader.Read() &&
                           (xmlReader.NodeType == XmlNodeType.Whitespace ||
                            xmlReader.NodeType == XmlNodeType.Comment)) {
                    }
                    // If the message has an XmlDeclaration, extract the encoding.
                    if (xmlReader.NodeType == XmlNodeType.XmlDeclaration) {
                        while (xmlReader.MoveToNextAttribute()) {
                            if (xmlReader.Name == "encoding") {
                                encoding = Encoding.GetEncoding(xmlReader.Value);
                            }
                        }
                        // Step back from the attribute to the declaration node
                        // itself, then advance past the declaration.
                        xmlReader.MoveToElement();
                        xmlReader.Read();
                    }
                    // Move to the first element.
                    xmlReader.MoveToContent();
                    if (xmlReader.EOF) {
                        return null;
                    }
                    // Read the entire document.
                    xmlDocument = new XmlDocument();
                    xmlDocument.Load(xmlReader.ReadSubtree());
                } catch (XmlException) {
                    // Parsing failed. If the reader ran into the end of the
                    // buffer, the message is assumed to be merely incomplete.
                    // Otherwise the XML is assumed invalid and the exception is
                    // re-thrown (with its original stack trace).
                    if (sbuffer.EOF) {
                        return null;
                    }
                    throw;
                }

                // Try to deserialize into a specialised data container using
                // XmlSerializer, selected by the name of the root element.
                // NOTE(review): the type name is derived from received data;
                // confirm that every type in that namespace is safe to create.
                Type type = null;
                try {
                    type = Type.GetType("my.namespace." + xmlDocument.DocumentElement.Name);
                } catch (Exception) {
                    // No specialised data container for this element; fall
                    // back to the generic Message below.
                }
                if (type == null) {
                    message = new Message();
                } else {
                    // TODO: reuse the serializer...
                    System.Xml.Serialization.XmlSerializer ser = new System.Xml.Serialization.XmlSerializer(type);
                    message = (Message)ser.Deserialize(new XmlNodeReader(xmlDocument));
                }
                message.doc = xmlDocument;

                // At this point the first XML message was successfully parsed.
                // Remember the line position of the node the reader stopped on:
                // the root EndElement, or the root Element itself if the root
                // is an empty element such as <ping/>.
                xmlReader.MoveToElement();
                root_is_empty_element = xmlReader.NodeType == XmlNodeType.Element;
                IXmlLineInfo xmlLineInfo = xmlReader as IXmlLineInfo;
                if (xmlLineInfo != null && xmlLineInfo.HasLineInfo()) {
                    LineNumber = xmlLineInfo.LineNumber;
                    LinePosition = xmlLineInfo.LinePosition;
                }

                // Try to read the rest of the buffer. If an exception is
                // thrown, further data (another message) follows and the buffer
                // has to be truncated; otherwise everything was consumed.
                // The catch is deliberately broad: any failure here only means
                // "more data follows", the parsed message is still valid.
                try {
                    while (xmlReader.Read()) {
                    }
                } catch (Exception) {
                    truncate_buffer = true;
                }
            }

            if (truncate_buffer) {
                if (LineNumber < 0) {
                    throw new InvalidOperationException("LineNumber not given. Cannot truncate xml buffer");
                }
                int consumed = FindMessageEndByteOffset(
                    buffer, encoding, xmlDocument.DocumentElement.Name,
                    LineNumber, LinePosition, root_is_empty_element);
                // Remove the bytes of the first message from the buffer.
                consumed = Math.Min(consumed, buffer.Length);
                byte[] remaining = new byte[buffer.Length - consumed];
                Buffer.BlockCopy(buffer, consumed, remaining, 0, remaining.Length);
                buffer = remaining;
            } else {
                buffer = new byte[0];
            }
        }
        return message;
    }

    // Returns the byte offset just behind the closing '>' of the root element
    // whose name starts at the given 1-based line/column. This is sloppy by
    // nature: the bytes are decoded with the given encoding, the position is
    // located in the resulting string and converted back to a byte count.
    private static int FindMessageEndByteOffset(byte[] data, Encoding encoding, string rootName,
                                                int lineNumber, int linePosition,
                                                bool rootIsEmptyElement) {
        string s = encoding.GetString(data);

        // GetString keeps a leading byte order mark as U+FEFF, but the
        // XmlReader does not count it as a column, so skip it on line 1.
        int char_index = (s.Length > 0 && s[0] == '\uFEFF') ? 1 : 0;

        // Seek to the start of the line.
        while (--lineNumber > 0) {
            char_index = s.IndexOfAny(NewlineChars, char_index);
            // char_index should not be -1 here because the reader reported
            // that many lines; otherwise the index below is out of range and
            // an exception is thrown, which is appropriate.
            char_index++;
            if (s[char_index - 1] == '\r' && s.Length > char_index && s[char_index] == '\n') {
                char_index++;
            }
        }
        char_index += linePosition - 1;

        if (rootIsEmptyElement) {
            // "<name attr='...' />": the reader is positioned on the name of
            // the start tag. The element ends at the first '>' that is not
            // inside a quoted attribute value.
            if (string.CompareOrdinal(s, char_index, rootName, 0, rootName.Length) != 0) {
                throw new InvalidOperationException("could not find EndElement to truncate the xml buffer.");
            }
            int scan = char_index + rootName.Length;
            int end = -1;
            char quote = '\0';
            while (scan < s.Length) {
                char c = s[scan++];
                if (quote != '\0') {
                    if (c == quote) {
                        quote = '\0';
                    }
                } else if (c == '"' || c == '\'') {
                    quote = c;
                } else if (c == '>') {
                    end = scan;
                    break;
                }
            }
            if (end < 0) {
                throw new InvalidOperationException("could not find EndElement to truncate the xml buffer.");
            }
            char_index = end;
        } else {
            // "</name   >": the reader is positioned on the name of the end
            // tag. The name is escaped because XML names may contain '.'.
            var rgx = new System.Text.RegularExpressions.Regex(
                System.Text.RegularExpressions.Regex.Escape(rootName) + "[ \r\n\t]*\\>");
            System.Text.RegularExpressions.Match match = rgx.Match(s, char_index);
            if (!match.Success || match.Index != char_index) {
                throw new InvalidOperationException("could not find EndElement to truncate the xml buffer.");
            }
            char_index += match.Value.Length;
        }

        // Convert the character offset back to a byte offset (for the given encoding).
        return encoding.GetByteCount(s.Substring(0, char_index));
    }
}
Run Code Online (Sandbox Code Playgroud)