string - How can I handle special characters in a C++ program? -
I am trying to read multiple text files from a folder and store each word's starting position. I am using Boost to clear the text of punctuation.
I encounter a problem when words contain special characters such as Õ, Ø, æ, etc. In that case, I get the error message: "expression: (unsigned)(c+1)<=256".
Here is the code of the application I've mentioned:
// Walks every entry of the dataset directory, reads each file word by word,
// tokenizes the text with boost::tokenizer (which drops punctuation), and
// records, for every token, its offset inside the concatenation of all tokens.
#include "stdafx.h"

#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include "/../dirent.h/dirent.h"
#include <boost/tokenizer.hpp>

using namespace std;
using namespace boost;

int main()
{
    DIR* dir = opendir("d:/../dataset/");
    if (dir == nullptr) {
        // readdir() on a null handle is undefined; bail out instead.
        cerr << "cannot open dataset directory" << endl;
        return 1;
    }

    int number_of_words = 0;
    dirent* pdir = nullptr;
    while ((pdir = readdir(dir)) != nullptr) {
        cout << "-------------------------------------------" << endl;
        cout << "name of text file: " << pdir->d_name << endl;

        // A std::string cannot overflow the way a fixed char[300] filled
        // with strcpy/strcat can when the entry name is long.
        const string filename = string("d:/.../dataset/") + pdir->d_name;

        // First pass: count the whitespace-separated words.
        ifstream file(filename.c_str());
        std::istream_iterator<std::string> beg(file), end;
        number_of_words = static_cast<int>(distance(beg, end));
        //cout<<"number of words in file: "<<number_of_words<<endl;

        // Second pass: rebuild the text with single spaces between words.
        ifstream files(filename.c_str());
        if (file.is_open()) {
            string filestring;
            string output;
            // Testing the extraction itself (rather than eof()) avoids
            // appending the last word a second time after the final read fails.
            while (files >> output) {
                filestring += " ";
                filestring += output;
                //cout<<output<<endl;
            }
            //cout<<filestring<<endl;
            cout << "number of characters: " << filestring.size() << endl;
            cout << "-------------------------------------------" << endl;

            // NOTE(review): tokenizer<> works on narrow `char`. Its default
            // separator presumably classifies characters with the <cctype>
            // functions, so bytes above 0x7F (Õ, Ø, æ, ...) become negative
            // values where `char` is signed -- the likely source of the
            // "(unsigned)(c+1)<=256" assertion. Confirm against the Boost and
            // CRT documentation.
            string filestringtokenized;
            tokenizer<> tok(filestring);
            int index = 0;
            vector<int> myvector;
            for (tokenizer<>::iterator it = tok.begin(); it != tok.end(); ++it) {
                const string currentword = *it;
                // Offset of this token within filestringtokenized.
                myvector.push_back(index);
                index += static_cast<int>(currentword.size());
                //cout<<index<<"\t";
                //cout<<*it<<endl;
                filestringtokenized += currentword;
            }
        }
        // Both ifstreams are closed by their destructors at the end of
        // each iteration.
    }

    closedir(dir);
    return 0;
}
Why does this problem appear, and how can I solve it?
Something like this should work:
#include <iostream> #include <string> #include <vector> #include <boost/tokenizer.hpp> using string = std::wstring; using tokenizer = boost::tokenizer< boost::char_delimiters_separator<string::value_type>, string::const_iterator, string>; int main() { string str(l"Õ, Ø, æ"); tokenizer tok (str); for(tokenizer::iterator beg=tok.begin(); beg!=tok.end(); ++beg) { std::wcout << (*beg) << l'\n'; } }
It uses a tokenizer for wide characters.
c++ string boost
No comments:
Post a Comment