Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new character encoding/decoding #35

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
* Fri Jan 29 2016 Mohammad Kolahdouzan <[email protected]> 3.0.0
- replaced all String.getBytes() used for character encoding/decoding with
CharsetEncoder/CharsetDecoder
- removed support for all encodings except utf-8

* Fri Nov 20 2015 Kenneth Kharma <[email protected]> 2.2.0
- only emit System::Startup and System::Shutdown when heartbeats are enabled
- added heartbeat support to emitter groups
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
<groupId>org.lwes</groupId>
<artifactId>lwes-java</artifactId>
<packaging>jar</packaging>
<version>2.2.0</version>
<name>lwes-java</name>
<description>Lightweight event system, java implementation</description>
<url>http://lwes.org</url>
Expand Down Expand Up @@ -336,4 +335,5 @@
</build>
</profile>
</profiles>
<version>3.0.0</version>
</project>
122 changes: 56 additions & 66 deletions src/main/java/org/lwes/ArrayEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
Expand All @@ -25,18 +24,19 @@
import java.util.TreeSet;

import org.apache.commons.lang3.mutable.MutableInt;
import org.lwes.MemoryPool.Buffer;
import org.lwes.serializer.Deserializer;
import org.lwes.serializer.DeserializerState;
import org.lwes.serializer.Serializer;
import org.lwes.util.EncodedString;
import org.lwes.util.Util;

public final class ArrayEvent extends DefaultEvent {

private static final int SERIALIZED_ENCODING_LENGTH;
private byte[] bytes;
private final DeserializerState tempState = new DeserializerState();
private int length = 3;
private short encoding = DEFAULT_ENCODING;
private static Map<ArrayEventStats, MutableInt> STATS =
new EnumMap<ArrayEventStats, MutableInt>(ArrayEventStats.class);

Expand All @@ -57,10 +57,10 @@ public final class ArrayEvent extends DefaultEvent {
public ArrayEvent() {
bytes = new byte[MAX_MESSAGE_SIZE];
length = getValueListIndex();
setEncoding(DEFAULT_ENCODING);
setEncoding();
updateCreationStats();
}

/**
* All constructors call this aux function once
*/
Expand All @@ -77,7 +77,7 @@ public ArrayEvent(String name) throws EventSystemException {
setEventName(name);
}


/**
* Creates a new event from the given byte array, copying it only if the copy flag is true.
* @param bytes
Expand All @@ -101,7 +101,7 @@ public ArrayEvent(final byte[] bytes, final int len, final boolean copy) {
updateCreationStats();
resetCaches();
}

/**
* Creates a new event, making a copy of the given byte array into a newly allocated buffer
* @param bytes
Expand All @@ -113,21 +113,20 @@ public ArrayEvent(final byte[] bytes) {
public ArrayEvent(final byte[] bytes, boolean copy) {
this(bytes, bytes.length, copy);
}

private ArrayEvent(byte[] bytes, int offset, int length, int excess) {
this.bytes = Arrays.copyOfRange(bytes, offset, offset + length + excess);
this.length = length;
updateCreationStats();
resetCaches();
}

private ArrayEvent(byte[] bytes, int length, short encoding) {
private ArrayEvent(byte[] bytes, int length) {
this();
assert length <= bytes.length;
assert length <= this.bytes.length;
System.arraycopy(bytes, 0, this.bytes, 0, length);
this.length = length;
this.encoding = encoding;
}

@Override
Expand All @@ -141,7 +140,6 @@ public void reset() {
Arrays.fill(bytes, (byte) 0);
length = getValueListIndex();
tempState.reset();
encoding = DEFAULT_ENCODING;
}

@Override
Expand All @@ -160,25 +158,29 @@ public void clear(String key) {

@Override
public void setEventName(String name) {
checkShortStringLength(name, encoding, MAX_EVENT_NAME_SIZE);
checkShortStringLength(name, MAX_EVENT_NAME_SIZE);
final String oldName = getEventName();
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this even allow an event name to be changed? It would be easier if the name were set in the constructor and not changeable. Oh well.

final String defaultEncodingString = ENCODING_STRINGS[DEFAULT_ENCODING].getEncodingString();
try {
final byte[] oldBytes = oldName.getBytes(defaultEncodingString);
final byte[] newBytes = name.getBytes(defaultEncodingString);
if (oldBytes != newBytes) {
final int numFields = getNumEventAttributes();
final int oldValueListIndex = getValueListIndex();
final int newValueListIndex = oldValueListIndex + newBytes.length - oldBytes.length;
Serializer.serializeUBYTE((short) newBytes.length, bytes, 0);
shiftTail(oldValueListIndex, newValueListIndex);
int offset = Serializer.serializeEVENTWORD(name, bytes, 0);
Serializer.serializeUINT16(numFields, bytes, offset);
}
}
catch (UnsupportedEncodingException e) {
throw new IllegalArgumentException("Unknown Encoding: " + defaultEncodingString);

Buffer oldBytesBuffer = EncodedString.encode(oldName);
final byte[] oldBytes = oldBytesBuffer.getEncoderOutputBuffer().array();
int oldBytesLen = oldBytesBuffer.getEncoderOutputBuffer().position();

Buffer newBytesBuffer = EncodedString.encode(name);
final byte[] newBytes = newBytesBuffer.getEncoderOutputBuffer().array();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how did this check ever work before?

int newBytesLen = newBytesBuffer.getEncoderOutputBuffer().position();

if (!Util.compareByteArrays(oldBytes, oldBytesLen, newBytes, newBytesLen)) {
final int numFields = getNumEventAttributes();
final int oldValueListIndex = getValueListIndex();
final int newValueListIndex = oldValueListIndex + newBytesLen - oldBytesLen;
Serializer.serializeUBYTE((short) newBytesLen, bytes, 0);
shiftTail(oldValueListIndex, newValueListIndex);
int offset = Serializer.serializeEVENTWORD(name, bytes, 0);
Serializer.serializeUINT16(numFields, bytes, offset);
}

MemoryPool.putBack(newBytesBuffer);
MemoryPool.putBack(oldBytesBuffer);
}

/**
Expand All @@ -187,10 +189,10 @@ public void setEventName(String name) {
*/
@Override
public void set(String key, FieldType type, Object value) {
checkShortStringLength(key, encoding, MAX_FIELD_NAME_SIZE);
checkShortStringLength(key, MAX_FIELD_NAME_SIZE);
if (ENCODING.equals(key)) {
if (type == FieldType.INT16) {
setEncoding((Short) value);
setEncoding();
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're going to commit to always using UTF-8, we probably shouldn't bother calling setEncoding() again, even if someone is setting the "enc" field specifically. Maybe we should throw an exception if anything other than UTF-8 is passed in here for "enc". This code will currently allow someone to think they've successfully set the encoding to ISO_8859_1 by setting the "enc" field directly, but the serialized event will be UTF-8

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am waiting for a second opinion on Nick's comment. Should we throw an exception if one tries to set the encoding to anything other than UTF-8?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should settle on UTF-8 and remove the encoding structures altogether.

}
else {
throw new EventSystemException("Attempted to set " + ENCODING + " with type "
Expand All @@ -201,7 +203,7 @@ public void set(String key, FieldType type, Object value) {
else {
if (type == FieldType.STRING || type == FieldType.STRING_ARRAY) {
if (find(ENCODING) < 0) {
setEncoding(encoding);
setEncoding();
}
}
final int fieldIndex = find(key);
Expand All @@ -211,7 +213,7 @@ public void set(String key, FieldType type, Object value) {
final FieldType oldType = FieldType.byToken(bytes[tokenIndex]);
if (oldType == type && type.isConstantSize()) {
// Modify the value in place, requiring no shifts.
Serializer.serializeValue(type, value, encoding, bytes, tokenIndex + 1);
Serializer.serializeValue(type, value, bytes, tokenIndex + 1);
return;
}
clear(key);
Expand All @@ -235,7 +237,7 @@ private void appendField(String key, FieldType type, Object value) {
try {
length += Serializer.serializeATTRIBUTEWORD(key, bytes, length);
length += Serializer.serializeBYTE(type.token, bytes, length);
length += Serializer.serializeValue(type, value, encoding, bytes, length);
length += Serializer.serializeValue(type, value, bytes, length);
setNumEventAttributes(getNumEventAttributes() + 1);
}
catch (ArrayIndexOutOfBoundsException e) {
Expand All @@ -248,28 +250,21 @@ private void appendField(String key, FieldType type, Object value) {
}

@Override
public void setEncoding(short encoding) {
if (encoding < 0 || encoding >= ENCODING_STRINGS.length) {
throw new IllegalArgumentException(
"Unable to set " + ENCODING + " to " + encoding + "; acceptable range is 0<=enc<" +
ENCODING_STRINGS.length);
}

this.encoding = encoding;
public void setEncoding() {
final int fieldCountIndex = getFieldCountIndex();
final int numFields = deserializeUINT16(fieldCountIndex);

tempState.set(fieldCountIndex + 2);
if (numFields == 0) {
// We had no fields at all; just set ENCODING.
appendField(ENCODING, FieldType.INT16, encoding);
appendField(ENCODING, FieldType.INT16, UTF_8);
return;
}
else {
if (ENCODING.equals(Deserializer.deserializeATTRIBUTEWORD(tempState, bytes))) {
if (FieldType.INT16.token == Deserializer.deserializeBYTE(tempState, bytes)) {
// Encoding was already the first field and the right type. Just change the value.
Serializer.serializeINT16(encoding, bytes, tempState.currentIndex());
Serializer.serializeINT16(UTF_8, bytes, tempState.currentIndex());
return;
} else {
// Encoding was the first field, but had the wrong type. Clear it and recreate below.
Expand All @@ -284,7 +279,7 @@ public void setEncoding(short encoding) {
shiftTail(index, index + SERIALIZED_ENCODING_LENGTH + 3);
index += Serializer.serializeATTRIBUTEWORD(ENCODING, bytes, index);
index += Serializer.serializeBYTE(FieldType.INT16.token, bytes, index);
index += Serializer.serializeINT16(encoding, bytes, index);
index += Serializer.serializeINT16(UTF_8, bytes, index);
setNumEventAttributes(getNumEventAttributes() + 1);
}

Expand Down Expand Up @@ -359,21 +354,16 @@ private Object get(FieldType type, int valueIndex) {
}

private Object get(FieldType type, DeserializerState state) {
return Deserializer.deserializeValue(state, bytes, type, encoding);
}

@Override
public short getEncoding() {
return encoding;
return Deserializer.deserializeValue(state, bytes, type);
}

/**
* This reads the encoding from the serialized event, without using the cached
* this.encoding value.
*/
private short readEncoding() {
final Short encodingValue = getInt16(ENCODING);
return encodingValue == null ? DEFAULT_ENCODING : encodingValue;
getInt16(ENCODING); // ignore the encoding
return UTF_8;
}

@Override
Expand Down Expand Up @@ -410,33 +400,35 @@ public void deserialize(ByteBuffer buffer, int length) {
}

private void resetCaches() {
this.encoding = readEncoding();
readEncoding();
}

@Override
public int getBytesSize() {
return length;
}

public int getCapacity() {
return bytes.length;
}

@Override
public Event copy() {
STATS.get(ArrayEventStats.COPIES).increment();
return new ArrayEvent(bytes, length, encoding);
return new ArrayEvent(bytes, length);
}

private int find(String key) {
int count = 0;
Buffer buffer = null;
try {
final byte[] keyBytes = EncodedString.getBytes(key, ENCODING_STRINGS[DEFAULT_ENCODING]);
buffer = EncodedString.encode(key);
for (tempState.set(getValueListIndex()); tempState.currentIndex() < length; ) {
++count;
final int keyIndex = tempState.currentIndex();
final int keyLength = bytes[keyIndex] & 0xff;
if (arrayEquals(bytes, keyIndex + 1, keyLength, keyBytes, 0, keyBytes.length)) {
if (arrayEquals(bytes, keyIndex + 1, keyLength, buffer.getEncoderOutputBuffer().array(),
0, buffer.getEncoderOutputBuffer().position())) {
return keyIndex;
}
else {
Expand All @@ -455,6 +447,8 @@ private int find(String key) {
return -1;
}
finally {
// return the buffer back to the pool
MemoryPool.putBack(buffer);
STATS.get(ArrayEventStats.FINDS).increment();
STATS.get(ArrayEventStats.PARSES).add(count);
}
Expand All @@ -481,7 +475,7 @@ public int getValueByteSize(FieldType type, int valueIndex) {
}
if (type.isArray()) {
final FieldType componentType = type.getComponentType();

if (type.isNullableArray()) {
// array_len + bitset_len + bitset + array
DeserializerState ds = new DeserializerState();
Expand All @@ -498,7 +492,7 @@ public int getValueByteSize(FieldType type, int valueIndex) {
}
return ds.currentIndex() - valueIndex;
}

if (componentType.isConstantSize()) {
return 2 + deserializeUINT16(valueIndex) * componentType.getConstantSize();
} else {
Expand Down Expand Up @@ -543,7 +537,6 @@ public void copyFrom(Event event) {
System.arraycopy(ae.bytes, 0, bytes, 0, ae.length);
length = ae.length;
tempState.reset();
encoding = ae.encoding;
}
else {
super.copyFrom(event);
Expand Down Expand Up @@ -586,13 +579,10 @@ public void swap(ArrayEvent event) {
}
final byte[] tempBytes = bytes;
final int tempLength = length;
final short tempEncoding = encoding;
this.bytes = event.bytes;
this.length = event.length;
this.encoding = event.encoding;
event.bytes = tempBytes;
event.length = tempLength;
event.encoding = tempEncoding;
STATS.get(ArrayEventStats.SWAPS).increment();
}

Expand All @@ -614,7 +604,7 @@ public ArrayEvent trim(int excess) {
return new ArrayEvent(bytes, 0, length, excess);
}


private static boolean arrayEquals(final byte[] b1, int o1, final int l1, final byte[] b2, final int o2, final int l2) {
if (l1 != l2) {
return false;
Expand Down Expand Up @@ -646,7 +636,7 @@ public String toStringDetailed() {
buf.append(String.format("Event name: \"%s\"%n", getEventName()));
buf.append(String.format("Serialized length: %d%n", length));
buf.append(String.format("tempState index: %d%n", tempState.currentIndex()));
buf.append(String.format("Encoding: %s%n", Event.ENCODING_STRINGS[encoding].getEncodingString()));
buf.append(String.format("Encoding: %s%n", UTF_8_NAME));
buf.append(String.format("Number of fields: %d%n", getNumEventAttributes()));
final DeserializerState ds = new DeserializerState();
ds.set(getValueListIndex());
Expand All @@ -667,7 +657,7 @@ public String toStringDetailed() {
throw new Exception("Error when reading field name: " + e.getMessage());
}
try {
value = Deserializer.deserializeValue(ds, bytes, type, encoding);
value = Deserializer.deserializeValue(ds, bytes, type);
}
catch (Exception e) {
throw new Exception("Error when reading field name: " + e.getMessage());
Expand Down Expand Up @@ -711,7 +701,7 @@ private final class ArrayEventFieldAccessor extends DefaultFieldAccessor {
currentValueIndex = Integer.MIN_VALUE;

public void advance() {
// Deserialize name,type eagerly; deserialize value lazily.
// Deserialize name,type eagerly; deserialize value lazily.
currentFieldIndex = nextFieldIndex;
accessorTempState.set(currentFieldIndex);
setName(Deserializer.deserializeATTRIBUTEWORD(accessorTempState, bytes));
Expand Down
Loading