Statistics
| Revision:

svn-gvsig-desktop / branches / org.gvsig.desktop-2018a / org.gvsig.desktop.compat.cdc / org.gvsig.basicformats / src / main / java / org / gvsig / basicformats / impl / DefaultCPGFile.java @ 43876

History | View | Annotate | Download (7.55 KB)

1
package org.gvsig.basicformats.impl;
2

    
3
import java.io.File;
4
import java.io.IOException;
5
import org.apache.commons.io.FileUtils;
6
import org.apache.commons.io.FilenameUtils;
7
import org.apache.commons.lang3.StringUtils;
8
import org.gvsig.basicformats.CPGFile;
9
import org.slf4j.Logger;
10
import org.slf4j.LoggerFactory;
11

    
12

    
13
public class DefaultCPGFile implements CPGFile {
14

    
15
    private static final Logger logger = LoggerFactory.getLogger(DefaultPRJFile.class);
16

    
17

    
18
    private File source;
19
    private String charsetName = null;
20

    
21
    /**
22
     * Define the valid code pages (equivalent to MSDOS code pages). 
23
     * This codes are used on the byte 29 of the DBF header to define the DBF
24
     * codepage.
25
     *
26
     * The equivalences of these charsets using Java NIO charset names are
27
     * defined on the {@link #charsetNames} array (so 0x01 is
28
     * equivalent to IBM437, 0x02 to IBM850, etc)
29
     *
30
     * See some other equivalences in:
31
     * https://github.com/infused/dbf/blob/master/docs/supported_encodings.csv
32
     * https://github.com/olemb/dbfread/blob/master/dbfread/codepages.py
33
     * https://joinup.ec.europa.eu/svn/gvsig-desktop/trunk/libraries/libFMap/src/com/iver/cit/gvsig/fmap/drivers/dbf/DbfEncodings.java
34
     */
35
    private static final short[] codePages = {
36
        0x01, 0x02, 0x03, 0x04,
37
        0x08, 0x09, 0x0a, 0x0b,
38
        0x0d, 0x0e, 0x0f, 0x10,
39
        0x11, 0x12, 0x13, 0x14,
40
        0x15, 0x16, 0x17, 0x18,
41
        0x19, 0x1a, 0x1b, 0x1c,
42
        0x1d, 0x1f, 0x22, 0x23,
43
        0x24, 0x25, 0x26, 0x37,
44
        0x40, 0x4d, 0x4e, 0x4f,
45
        0x50, 0x57, 0x58, 0x59,
46
        0x64, 0x65, 0x66, 0x67,
47
        0x68, 0x69, 0x6a, 0x6b,
48
        0x6c, 0x78, 0x79, 0x7a,
49
        0x7b, 0x7c, 0x7d, 0x7d,
50
        0x86, 0x87, 0x88, 0xc8,
51
        0xc9, 0xca, 0xcb, 0xcc};
52

    
53
    /**
54
     * Equivalent Java charset names to the code pages defined in
55
     * {@link #codePages}, using Java NIO Charset names (which differ
56
     * from JAVA IO names, see
57
     * https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html)
58
     */
59
    private static final String[] charsetNames = new String[]{
60
        "IBM437", "IBM850", "windows-1252", "x-MacRoman",
61
        "IBM865", "IBM437", "IBM850", "IBM437",
62
        "IBM437", "IBM850", "IBM437", "IBM850",
63
        "IBM437", "IBM850", "x-IBM943", "IBM850",
64
        "IBM437", "IBM850", "IBM865", "IBM437",
65
        "IBM437", "IBM850", "IBM437", "IBM863",
66
        "IBM850", "IBM852", "IBM852", "IBM852",
67
        "IBM860", "IBM850", "IBM866", "IBM850",
68
        "IBM852", "x-mswin-936", "x-IBM949", "IBM950",
69
        "x-IBM874", "windows-1252", "windows-1252", "windows-1252",
70
        "IBM852", "IBM866", "IBM865", "IBM861",
71
        // 0x68 and 0x69 are unofficial "Codepage 895 Kamenicky (Czech) MS-DOS" and "Codepage 620  Mazovia (Polish) MS-DOS",
72
        // but there is no Java equivalent
73
        // so we use CP437 which is the closest charset for the latin characters part
74
        "IBM437", "IBM437", "x-IBM737", "IBM857",
75
        "IBM863", "x-IBM950", "x-IBM949", "x-mswin-936",
76
        "x-IBM942", "x-IBM874", "windows-1255", "windows-1256",
77
        "x-IBM737", "IBM852", "IBM857", "windows-1250",
78
        "windows-1251", "windows-1254", "windows-1253", "windows-1257"};
79

    
80
    public DefaultCPGFile() {
81
        this.charsetName = null;
82
        this.source = null;
83
    }
84

    
85
    @Override
86
    public File getFile(File file) {
87
        File f = new File(FilenameUtils.removeExtension(file.getAbsolutePath()) + "." + FILE_EXTENSION);
88
        return f;
89
    }
90

    
91
    @Override
92
    public File getFile() {
93
        return source;
94
    }
95

    
96
    @Override
97
    public String getCharsetName() {
98
        return this.charsetName;
99
    }
100

    
101
    @Override
102
    public void setCharsetName(String charsetName) {
103
        this.charsetName = charsetName;
104
    }
105

    
106
    @Override
107
    public String toCharsetName(String codePageName) {
108
        if (codePageName.equals("UTF8")) {
109
            return "UTF-8";
110
        }
111
        if (codePageName.equals("SJIS")) {
112
            return "Shift_JIS";
113
        }
114

    
115
        if (StringUtils.isNumeric(codePageName)) {
116
            if (codePageName.startsWith("8859") && codePageName.length() > 4) {
117
                return "ISO-8859-" + codePageName.substring(4);
118
            }
119
            if (codePageName.startsWith("125") && codePageName.length() == 4) {
120
                return "windows-" + codePageName;
121
            }
122
            if (codePageName.length() == 3) {
123
                return "IBM-" + codePageName;
124
            }
125
            for (int i = 0; i < charsetNames.length; i++) {
126
                if (charsetNames[i].contains(codePageName)) {
127
                    return codePageName;
128
                }
129
            }
130
        }
131
        if (codePageName.equals("65001")) {
132
            return "UTF-8";
133
        }
134
        return codePageName;
135
    }
136

    
137
    /**
138
     * Gets the Java NIO charset name equivalent to the provided code page.
139
     * Gets null if the provided code page is not recognised
140
     * as a valid code
141
     *
142
     * @param codePage
143
     * @return
144
     */
145
    @Override
146
    public String toCharsetName(int codePage) {
147
        if (codePage != 0) {
148
            for (int i = 0; i < codePages.length; i++) {
149
                if (codePages[i] == codePage) {
150
                    return charsetNames[i];
151
                }
152
            }
153
        }
154
        return null;
155
    }
156

    
157
    @Override
158
    public String toCPGName(String charsetName) {
159
        if (charsetName.startsWith("windows-")
160
                || charsetName.startsWith("ISO-8859")
161
                || charsetName.startsWith("IBM-")
162
                || charsetName.startsWith("x-IBM")
163
                || charsetName.startsWith("x-mswin-")) {
164
            return charsetName.replaceAll("[^\\d]", "");
165
        }
166
        if (charsetName.equals("Shift_JIS")) {
167
            return "SJIS";
168
        }
169
        // For the rest of the charsets, we'll directly write the Java NIO Charset
170
        // Probably they will only be recognized by gvSIG, but it's better than nothing
171
        return charsetName;
172
    }
173

    
174
    /**
175
     * Returns the code page corresponding to the
176
     * provided charset name
177
     *
178
     * @param charsetName
179
     * @return The code page, or 0x00 if no equivalent code page was found for
180
     * the provided charsetName
181
     */
182
    @Override
183
    public int toCPG(String charsetName) {
184
        for (int i = 0; i < charsetNames.length; i++) {
185
            if (charsetNames[i].equals(charsetName)) {
186
                return codePages[i];
187
            }
188
        }
189
        // default
190
        return 0x00;
191
    }
192

    
193
    @Override
194
    public void read(File file) throws IOException {
195
        File f = this.getFile(file);
196
        if (f.exists()) {
197
            try {
198
                String theContents = FileUtils.readFileToString(f);
199
                theContents = StringUtils.trim(theContents);
200
                if (StringUtils.isNotEmpty(theContents)) {
201
                    String theCharset = toCharsetName(theContents);
202
                    this.charsetName = theCharset;
203
                    this.source = f.getAbsoluteFile();
204
                }
205
            } catch (IOException e) {
206
                logger.warn("Couldn't read " + FILE_EXTENSION + " file (" + f.getAbsolutePath() + ").", e);
207
                throw e;
208
            }
209
        }
210
    }
211

    
212
    @Override
213
    public void write(File file) throws IOException {
214
        File f = this.getFile(file);
215
        try {
216
            String export = toCPGName(this.charsetName) + "\n";
217
            FileUtils.writeStringToFile(f, export, "ISO-8859-1");
218
            this.source = f;
219
        } catch (Exception e) {
220
            logger.warn("Couldn't write " + FILE_EXTENSION + " file (" + f.getAbsolutePath() + ").", e);
221
            throw e;
222
        }
223
    }
224

    
225
}