| 15747 |
anikendra |
1 |
/************************************************************************
The zlib/libpng License

Copyright (c) 2006 Joerg Wiedenmann

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented;
   you must not claim that you wrote the original software.
   If you use this software in a product, an acknowledgment
   in the product documentation would be appreciated but is
   not required.

2. Altered source versions must be plainly marked as such,
   and must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source distribution.

***********************************************************************/
|
|
|
26 |
|
|
|
27 |
/********************************************************************
    created:    2006-01-28
    filename:   tokenizer.cpp
    author:     Jörg Wiedenmann

    purpose:    A tokenizer function which provides a very
                customizable way of breaking up strings.

    history:    2006-01-28, Original version
                2006-03-04, Fixed a small parsing bug, thanks Elias.
*********************************************************************/
|
|
|
38 |
|
|
|
39 |
#include "tokenizer.h"
|
|
|
40 |
|
|
|
41 |
using namespace std;
|
|
|
42 |
|
|
|
43 |
/*
 * Breaks `str` into tokens and stores them, in order, in `result`.
 *
 * str                  the input string to split
 * result               receives the tokens; any previous content is discarded
 * delimiters           characters that separate tokens and are dropped
 * delimiters_preserve  characters that separate tokens and are additionally
 *                      emitted as single-character tokens of their own
 * quote                characters that open a quoted section; the section is
 *                      closed by the same character that opened it. Inside a
 *                      quoted section delimiters (and other quote characters)
 *                      are ordinary text. The enclosing quote characters are
 *                      not part of the token.
 * esc                  escape characters; the character following one is
 *                      added to the token literally and is never treated as
 *                      a quote or delimiter. The escape character itself is
 *                      dropped, as is an escape character that ends the input.
 *
 * Empty tokens are never emitted: consecutive delimiters yield nothing
 * between them, and an empty quoted section ("") produces no token.
 * A quote that is never closed keeps the rest of the input quoted.
 */
void tokenize ( const std::string& str, std::vector<std::string>& result,
                const std::string& delimiters, const std::string& delimiters_preserve,
                const std::string& quote, const std::string& esc )
{
    // always start from an empty result vector
    result.clear();

    const std::string::size_type len = str.length(); // length of the input
    std::string token;        // the token that is currently being built
    bool quoted = false;      // true while a quoted section is open
    char current_quote = 0;   // the character that opened the current quote

    // for every char in the input-string
    for ( std::string::size_type pos = 0; pos < len; ++pos )
    {
        char ch = str[pos];          // pos < len is guaranteed by the loop
        bool add_char = true;        // assume ch is ordinary token text
        bool token_complete = false; // token is ready to be added to result
        bool add_delimiter = false;  // ch must be emitted as its own token

        // find() on an empty set returns npos, so no separate empty() checks
        // are needed for any of the character sets below.
        const bool escaped = ( std::string::npos != esc.find( ch ) );

        if ( escaped )
        {
            // step over the escape char and take the next char literally
            ++pos;
            if ( pos < len )
            {
                ch = str[pos];
            }
            else
            {
                // the input ends with an escape char: nothing to add
                add_char = false;
            }
        }
        else
        {
            // an unescaped quote char opens or closes a quoted section
            if ( std::string::npos != quote.find( ch ) )
            {
                if ( !quoted )
                {
                    quoted = true;
                    current_quote = ch;
                    add_char = false;  // the opening quote is not token text
                }
                else if ( current_quote == ch )
                {
                    quoted = false;
                    current_quote = 0;
                    add_char = false;  // the closing quote is not token text
                }
                // any other quote char inside an open quote is plain text
            }

            // delimiters only count outside of quoted sections; note that
            // a closing quote has already reset `quoted` at this point
            if ( !quoted )
            {
                const bool is_dropped =
                    ( std::string::npos != delimiters.find( ch ) );
                const bool is_preserved =
                    ( std::string::npos != delimiters_preserve.find( ch ) );

                if ( is_dropped || is_preserved )
                {
                    // a delimiter finishes the token built so far (if any)
                    // and is never part of that token itself
                    token_complete = !token.empty();
                    add_char = false;
                    add_delimiter = is_preserved;
                }
            }
        }

        // add the character to the token
        if ( add_char )
        {
            token.push_back( ch );
        }

        // add the token if it is complete, then start building the next one
        if ( token_complete )
        {
            result.push_back( token );
            token.clear();
        }

        // a preserved delimiter follows as a token of its own
        if ( add_delimiter )
        {
            result.push_back( std::string( 1, ch ) );
        }
    } // for

    // add the final token
    if ( !token.empty() )
    {
        result.push_back( token );
    }
}
|